From ae37ef649f2000b4b7b95e9118f8dafdf645e96e Mon Sep 17 00:00:00 2001 From: Lee Killough Date: Thu, 2 Apr 2020 18:43:34 -0400 Subject: [PATCH] Clean up text files --- .../Current-Release-Notes.rst | 48 +- Deep_learning/Deep-learning.rst | 24 +- Deep_learning/GCN-asm-tutorial.rst | 22 +- Deep_learning/MXNet.rst | 44 +- Deep_learning/caffe.rst | 50 +- Deep_learning/hipCaffe .rst | 62 +- Doxyfile | 2 +- FAQ/FAQ_HIP.rst | 42 +- GCN_ISA_Manuals/GCN-ISA-Manuals.rst | 24 +- GCN_ISA_Manuals/PCIe-features.rst | 50 +- GCN_ISA_Manuals/caffe.rst | 48 +- GCN_ISA_Manuals/testdocbook.rst | 172 +-- Installation_Guide/FAQ-on-Installation.rst | 34 +- Installation_Guide/HCC-Compiler.rst | 2 +- Installation_Guide/HIP.rst | 14 +- Installation_Guide/Installation-Guide.rst | 136 +-- ...ist-of-ROCm-Packages-for-Ubuntu-Fedora.rst | 38 +- .../More-about-how-ROCm-uses-PCIe-Atomics.rst | 52 +- .../Quick Start Installation Guide.rst | 54 +- Installation_Guide/QuickStartGuideOpenCL.rst | 20 +- Installation_Guide/ROC-smi.rst | 32 +- .../ROCK-Kernel-Driver_readme.rst | 2 +- Installation_Guide/ROCR-Runtime.rst | 2 +- Installation_Guide/ROCk-kernel.rst | 10 +- Installation_Guide/atmi.rst | 10 +- Other_Solutions/Other-Solutions.rst | 30 +- Other_Solutions/PCIe-Debug.rst | 24 +- Other_Solutions/ROCm_PCIe_Debug.md | 24 +- Programming_Guides/CUDAAPIHIP.rst | 4 +- Programming_Guides/CUDAAPIHIPTEXTURE.rst | 2 +- Programming_Guides/HIP-FAQ.rst | 6 +- Programming_Guides/HIP-GUIDE.rst | 508 ++++---- Programming_Guides/HIP-porting-guide.rst | 160 +-- Programming_Guides/HIP_Debugging.rst | 32 +- Programming_Guides/Kernel_language.rst | 596 +++++----- Programming_Guides/LanguageInto.rst | 14 +- Programming_Guides/Opencl-optimization.rst | 582 +++++----- .../Opencl-programming-guide.rst | 640 +++++----- Programming_Guides/Programming-Guides.rst | 24 +- Programming_Guides/hcc-guide.rst | 2 +- Programming_Guides/hcc-profile.rst | 8 +- Programming_Guides/hip-programming-guide.rst | 4 +- Programming_Guides/hip-programming.rst | 6 +- Programming_Guides/hip_install.rst | 4 +- Programming_Guides/hip_port.rst | 14 +- Programming_Guides/hip_profiling.rst | 58 +- Programming_Guides/hipporting-driver-api.rst | 84 +- README.md | 6 +- ROCm.rst | 36 +- ROCm_API_References/BLAS1.rst | 16 +- ROCm_API_References/BLAS2.rst | 20 +- ROCm_API_References/BLAS3.rst | 12 +- ROCm_API_References/HCC-API.rst | 4 +- ROCm_API_References/HIP-MATH.rst | 1012 ++++++++-------- .../HIP_API/Context-Management.rst | 50 +- ROCm_API_References/HIP_API/Control.rst | 4 +- .../HIP_API/Device-Memory-Access.rst | 8 +- .../HIP_API/Device-management.rst | 24 +- ROCm_API_References/HIP_API/Error.rst | 12 +- .../HIP_API/Event-Management.rst | 10 +- .../HIP_API/Initialization-and-Version.rst | 24 +- .../HIP_API/Memory-Management.rst | 40 +- .../HIP_API/Stream-Management.rst | 16 +- ROCm_API_References/ROCr-API.rst | 10 +- ROCm_API_References/Thrust.rst | 44 +- ROCm_API_References/api.rst | 8 +- ROCm_API_References/clBLAS.rst | 8 +- ROCm_API_References/clSPARSE_API.rst | 6 +- ROCm_API_References/clSPARSE_api.rst | 10 +- ROCm_API_References/rocBLAS.rst | 4 +- ROCm_Audio_Video_Tutorials/ROCm_videos.rst | 4 +- ROCm_Compiler_SDK/ROCm-Codeobj-format.rst | 36 +- ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst | 4 +- ROCm_Compiler_SDK/ROCm-Native-ISA.rst | 462 ++++---- ROCm_Compiler_SDK/ocml.rst | 30 +- ROCm_Glossary/ROCm-Glossary.rst | 4 +- ROCm_Libraries/ROCm_Libraries.rst | 192 +-- ROCm_Libraries/dep-lib.rst | 80 +- ROCm_Libraries/hipsparse_wiki.rst | 84 +- ROCm_Libraries/rocALUTION/Doxyfile | 
126 +- .../rocALUTION/src/base/base_matrix.hpp | 2 +- .../rocALUTION/src/base/host/CMakeLists.txt | 2 +- .../src/base/host/host_matrix_csr.cpp | 2 +- .../solvers/multigrid/ruge_stueben_amg.cpp | 4 +- .../preconditioners/preconditioner.hpp | 2 +- .../preconditioners/preconditioner_ai.hpp | 4 +- ROCm_Libraries/rocBLAS/Doxyfile | 124 +- .../rocBLAS/src/include/rocblas-functions.h | 14 +- .../rocBLAS/src/src/blas1/rocblas_copy.cpp | 2 +- .../rocBLAS/src/src/blas1/rocblas_scal.cpp | 2 +- .../rocBLAS/src/src/blas1/rocblas_swap.cpp | 2 +- .../src/src/blas_ex/rocblas_gemm_ex.hpp | 32 +- ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp | 82 +- ROCm_Libraries/rocFFT/Doxyfile | 124 +- ROCm_Libraries/rocSOLVER/API.rst | 8 +- ROCm_Libraries/rocSOLVER/Doxyfile | 124 +- ROCm_Libraries/rocSOLVER/Introduction.rst | 84 +- ROCm_Libraries/rocSOLVER/Jenkinsfile | 10 +- ROCm_Libraries/rocSOLVER/LICENSE.md | 2 +- .../rocSOLVER/bump_develop_version.sh | 4 +- .../rocSOLVER/bump_master_version.sh | 2 +- .../rocSOLVER/cmake/get-cli-arguments.cmake | 2 +- ROCm_Libraries/rocSOLVER/debian/postinst | 1 - ROCm_Libraries/rocSOLVER/debian/prerm | 1 - .../rocSOLVER/deps/external-lapack.cmake | 2 +- ROCm_Libraries/rocSOLVER/docs/Doxyfile | 124 +- .../library/include/rocsolver-functions.h | 1026 ++++++++--------- .../docs/library/include/rocsolver-types.h | 20 +- .../rocSOLVER/docs/library/src/CMakeLists.txt | 2 +- .../src/auxiliary/rocauxiliary_larf.cpp | 10 +- .../src/auxiliary/rocauxiliary_larf.hpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.cpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.hpp | 58 +- .../src/auxiliary/rocauxiliary_larfg.cpp | 2 +- .../src/auxiliary/rocauxiliary_larfg.hpp | 16 +- .../src/auxiliary/rocauxiliary_larft.cpp | 6 +- .../src/auxiliary/rocauxiliary_larft.hpp | 38 +- .../src/auxiliary/rocauxiliary_laswp.cpp | 4 +- .../src/auxiliary/rocauxiliary_laswp.hpp | 6 +- .../src/auxiliary/rocauxiliary_org2r.cpp | 2 +- .../src/auxiliary/rocauxiliary_org2r.hpp | 32 +- .../src/auxiliary/rocauxiliary_orgbr.cpp | 4 +- .../src/auxiliary/rocauxiliary_orgbr.hpp | 70 +- .../src/auxiliary/rocauxiliary_orgl2.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgl2.hpp | 32 +- .../src/auxiliary/rocauxiliary_orglq.cpp | 2 +- .../src/auxiliary/rocauxiliary_orglq.hpp | 40 +- .../src/auxiliary/rocauxiliary_orgqr.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgqr.hpp | 40 +- .../src/auxiliary/rocauxiliary_orm2r.cpp | 6 +- .../src/auxiliary/rocauxiliary_orm2r.hpp | 18 +- .../src/auxiliary/rocauxiliary_ormqr.cpp | 6 +- .../src/auxiliary/rocauxiliary_ormqr.hpp | 14 +- .../docs/library/src/common/rocblas.cpp | 2 +- .../library/src/include/common_device.hpp | 8 +- .../docs/library/src/include/ideal_sizes.hpp | 2 +- .../src/include/rocsolver_unique_ptr.hpp | 48 +- .../library/src/lapack/roclapack_gelq2.cpp | 14 +- .../library/src/lapack/roclapack_gelq2.hpp | 18 +- .../src/lapack/roclapack_gelq2_batched.cpp | 14 +- .../roclapack_gelq2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_gelqf.cpp | 14 +- .../library/src/lapack/roclapack_gelqf.hpp | 24 +- .../src/lapack/roclapack_gelqf_batched.cpp | 14 +- .../roclapack_gelqf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_geqr2.cpp | 14 +- .../library/src/lapack/roclapack_geqr2.hpp | 18 +- .../src/lapack/roclapack_geqr2_batched.cpp | 14 +- .../roclapack_geqr2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_geqrf.cpp | 14 +- .../library/src/lapack/roclapack_geqrf.hpp | 24 +- .../src/lapack/roclapack_geqrf_batched.cpp | 14 +- .../roclapack_geqrf_strided_batched.cpp 
| 14 +- .../library/src/lapack/roclapack_getf2.cpp | 18 +- .../library/src/lapack/roclapack_getf2.hpp | 22 +- .../src/lapack/roclapack_getf2_batched.cpp | 18 +- .../roclapack_getf2_strided_batched.cpp | 20 +- .../library/src/lapack/roclapack_getrf.cpp | 14 +- .../library/src/lapack/roclapack_getrf.hpp | 14 +- .../src/lapack/roclapack_getrf_batched.cpp | 16 +- .../roclapack_getrf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_getrs.cpp | 16 +- .../library/src/lapack/roclapack_getrs.hpp | 8 +- .../src/lapack/roclapack_getrs_batched.cpp | 18 +- .../roclapack_getrs_strided_batched.cpp | 18 +- .../library/src/lapack/roclapack_potf2.cpp | 12 +- .../library/src/lapack/roclapack_potf2.hpp | 22 +- .../src/lapack/roclapack_potf2_batched.cpp | 14 +- .../roclapack_potf2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_potrf.cpp | 12 +- .../library/src/lapack/roclapack_potrf.hpp | 28 +- .../src/lapack/roclapack_potrf_batched.cpp | 14 +- .../roclapack_potrf_strided_batched.cpp | 14 +- .../library/src/rocsolver-config.cmake.in | 2 +- ROCm_Libraries/rocSOLVER/docs/source/api.rst | 8 +- .../rocSOLVER/docs/source/index.rst | 4 +- .../rocSOLVER/docs/source/library.rst | 90 +- ROCm_Libraries/rocSOLVER/index.rst | 4 +- .../library/include/rocsolver-functions.h | 1026 ++++++++--------- .../library/include/rocsolver-types.h | 20 +- .../rocSOLVER/library/src/CMakeLists.txt | 2 +- .../src/auxiliary/rocauxiliary_larf.cpp | 10 +- .../src/auxiliary/rocauxiliary_larf.hpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.cpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.hpp | 58 +- .../src/auxiliary/rocauxiliary_larfg.cpp | 2 +- .../src/auxiliary/rocauxiliary_larfg.hpp | 16 +- .../src/auxiliary/rocauxiliary_larft.cpp | 6 +- .../src/auxiliary/rocauxiliary_larft.hpp | 38 +- .../src/auxiliary/rocauxiliary_laswp.cpp | 4 +- .../src/auxiliary/rocauxiliary_laswp.hpp | 6 +- .../src/auxiliary/rocauxiliary_org2r.cpp | 2 +- .../src/auxiliary/rocauxiliary_org2r.hpp | 32 +- .../src/auxiliary/rocauxiliary_orgbr.cpp | 4 +- .../src/auxiliary/rocauxiliary_orgbr.hpp | 70 +- .../src/auxiliary/rocauxiliary_orgl2.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgl2.hpp | 32 +- .../src/auxiliary/rocauxiliary_orglq.cpp | 2 +- .../src/auxiliary/rocauxiliary_orglq.hpp | 40 +- .../src/auxiliary/rocauxiliary_orgqr.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgqr.hpp | 40 +- .../src/auxiliary/rocauxiliary_orm2r.cpp | 6 +- .../src/auxiliary/rocauxiliary_orm2r.hpp | 18 +- .../src/auxiliary/rocauxiliary_ormqr.cpp | 6 +- .../src/auxiliary/rocauxiliary_ormqr.hpp | 14 +- .../rocSOLVER/library/src/common/rocblas.cpp | 2 +- .../library/src/include/common_device.hpp | 8 +- .../library/src/include/ideal_sizes.hpp | 2 +- .../src/include/rocsolver_unique_ptr.hpp | 48 +- .../library/src/lapack/roclapack_gelq2.cpp | 14 +- .../library/src/lapack/roclapack_gelq2.hpp | 18 +- .../src/lapack/roclapack_gelq2_batched.cpp | 14 +- .../roclapack_gelq2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_gelqf.cpp | 14 +- .../library/src/lapack/roclapack_gelqf.hpp | 24 +- .../src/lapack/roclapack_gelqf_batched.cpp | 14 +- .../roclapack_gelqf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_geqr2.cpp | 14 +- .../library/src/lapack/roclapack_geqr2.hpp | 18 +- .../src/lapack/roclapack_geqr2_batched.cpp | 14 +- .../roclapack_geqr2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_geqrf.cpp | 14 +- .../library/src/lapack/roclapack_geqrf.hpp | 24 +- .../src/lapack/roclapack_geqrf_batched.cpp | 14 +- 
.../roclapack_geqrf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_getf2.cpp | 18 +- .../library/src/lapack/roclapack_getf2.hpp | 22 +- .../src/lapack/roclapack_getf2_batched.cpp | 18 +- .../roclapack_getf2_strided_batched.cpp | 20 +- .../library/src/lapack/roclapack_getrf.cpp | 14 +- .../library/src/lapack/roclapack_getrf.hpp | 14 +- .../src/lapack/roclapack_getrf_batched.cpp | 16 +- .../roclapack_getrf_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_getrs.cpp | 16 +- .../library/src/lapack/roclapack_getrs.hpp | 8 +- .../src/lapack/roclapack_getrs_batched.cpp | 18 +- .../roclapack_getrs_strided_batched.cpp | 18 +- .../library/src/lapack/roclapack_potf2.cpp | 12 +- .../library/src/lapack/roclapack_potf2.hpp | 22 +- .../src/lapack/roclapack_potf2_batched.cpp | 14 +- .../roclapack_potf2_strided_batched.cpp | 14 +- .../library/src/lapack/roclapack_potrf.cpp | 12 +- .../library/src/lapack/roclapack_potrf.hpp | 28 +- .../src/lapack/roclapack_potrf_batched.cpp | 14 +- .../roclapack_potrf_strided_batched.cpp | 14 +- .../library/src/rocsolver-config.cmake.in | 2 +- ROCm_Libraries/rocSOLVER/src/CMakeLists.txt | 2 +- .../src/auxiliary/rocauxiliary_larf.cpp | 10 +- .../src/auxiliary/rocauxiliary_larf.hpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.cpp | 12 +- .../src/auxiliary/rocauxiliary_larfb.hpp | 58 +- .../src/auxiliary/rocauxiliary_larfg.cpp | 2 +- .../src/auxiliary/rocauxiliary_larfg.hpp | 16 +- .../src/auxiliary/rocauxiliary_larft.cpp | 6 +- .../src/auxiliary/rocauxiliary_larft.hpp | 38 +- .../src/auxiliary/rocauxiliary_laswp.cpp | 4 +- .../src/auxiliary/rocauxiliary_laswp.hpp | 6 +- .../src/auxiliary/rocauxiliary_org2r.cpp | 2 +- .../src/auxiliary/rocauxiliary_org2r.hpp | 32 +- .../src/auxiliary/rocauxiliary_orgbr.cpp | 4 +- .../src/auxiliary/rocauxiliary_orgbr.hpp | 70 +- .../src/auxiliary/rocauxiliary_orgl2.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgl2.hpp | 32 +- .../src/auxiliary/rocauxiliary_orglq.cpp | 2 +- .../src/auxiliary/rocauxiliary_orglq.hpp | 40 +- .../src/auxiliary/rocauxiliary_orgqr.cpp | 2 +- .../src/auxiliary/rocauxiliary_orgqr.hpp | 40 +- .../src/auxiliary/rocauxiliary_orm2r.cpp | 6 +- .../src/auxiliary/rocauxiliary_orm2r.hpp | 18 +- .../src/auxiliary/rocauxiliary_ormqr.cpp | 6 +- .../src/auxiliary/rocauxiliary_ormqr.hpp | 14 +- .../rocSOLVER/src/common/rocblas.cpp | 2 +- .../rocSOLVER/src/include/common_device.hpp | 8 +- .../rocSOLVER/src/include/ideal_sizes.hpp | 2 +- .../src/include/rocsolver_unique_ptr.hpp | 48 +- .../rocSOLVER/src/lapack/roclapack_gelq2.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_gelq2.hpp | 18 +- .../src/lapack/roclapack_gelq2_batched.cpp | 14 +- .../roclapack_gelq2_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_gelqf.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_gelqf.hpp | 24 +- .../src/lapack/roclapack_gelqf_batched.cpp | 14 +- .../roclapack_gelqf_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_geqr2.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_geqr2.hpp | 18 +- .../src/lapack/roclapack_geqr2_batched.cpp | 14 +- .../roclapack_geqr2_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_geqrf.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_geqrf.hpp | 24 +- .../src/lapack/roclapack_geqrf_batched.cpp | 14 +- .../roclapack_geqrf_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_getf2.cpp | 18 +- .../rocSOLVER/src/lapack/roclapack_getf2.hpp | 22 +- .../src/lapack/roclapack_getf2_batched.cpp | 18 +- .../roclapack_getf2_strided_batched.cpp | 20 +- 
.../rocSOLVER/src/lapack/roclapack_getrf.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_getrf.hpp | 14 +- .../src/lapack/roclapack_getrf_batched.cpp | 16 +- .../roclapack_getrf_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_getrs.cpp | 16 +- .../rocSOLVER/src/lapack/roclapack_getrs.hpp | 8 +- .../src/lapack/roclapack_getrs_batched.cpp | 18 +- .../roclapack_getrs_strided_batched.cpp | 18 +- .../rocSOLVER/src/lapack/roclapack_potf2.cpp | 12 +- .../rocSOLVER/src/lapack/roclapack_potf2.hpp | 22 +- .../src/lapack/roclapack_potf2_batched.cpp | 14 +- .../roclapack_potf2_strided_batched.cpp | 14 +- .../rocSOLVER/src/lapack/roclapack_potrf.cpp | 12 +- .../rocSOLVER/src/lapack/roclapack_potrf.hpp | 28 +- .../src/lapack/roclapack_potrf_batched.cpp | 14 +- .../roclapack_potrf_strided_batched.cpp | 14 +- .../rocSOLVER/src/rocsolver-config.cmake.in | 2 +- ROCm_Libraries/rocSPARSE/Doxyfile | 124 +- ROCm_Libraries/rocr/Doxyfile | 122 +- ROCm_Libraries/rocr/src/README.md | 4 +- .../src/cmake_modules/COPYING-CMAKE-SCRIPTS | 2 +- .../rocr/src/cmake_modules/utils.cmake | 12 +- ROCm_Libraries/rocr/src/core/common/shared.h | 16 +- ROCm_Libraries/rocr/src/core/inc/agent.h | 16 +- .../rocr/src/core/inc/amd_blit_kernel.h | 16 +- .../rocr/src/core/inc/amd_blit_sdma.h | 16 +- .../rocr/src/core/inc/amd_cpu_agent.h | 16 +- .../rocr/src/core/inc/amd_elf_image.hpp | 16 +- .../rocr/src/core/inc/amd_gpu_agent.h | 16 +- .../rocr/src/core/inc/amd_hsa_code.hpp | 16 +- .../rocr/src/core/inc/amd_hsa_loader.hpp | 16 +- .../rocr/src/core/inc/amd_loader_context.hpp | 16 +- .../rocr/src/core/inc/amd_memory_region.h | 16 +- .../rocr/src/core/inc/amd_topology.h | 16 +- ROCm_Libraries/rocr/src/core/inc/blit.h | 16 +- ROCm_Libraries/rocr/src/core/inc/checked.h | 16 +- .../rocr/src/core/inc/hsa_api_trace_int.h | 16 +- .../rocr/src/core/inc/hsa_ext_interface.h | 16 +- .../rocr/src/core/inc/hsa_internal.h | 16 +- .../rocr/src/core/inc/hsa_table_interface.h | 16 +- ROCm_Libraries/rocr/src/core/inc/isa.h | 16 +- .../rocr/src/core/inc/memory_region.h | 16 +- ROCm_Libraries/rocr/src/core/inc/registers.h | 16 +- .../rocr/src/core/runtime/amd_blit_kernel.cpp | 16 +- .../rocr/src/core/runtime/amd_cpu_agent.cpp | 22 +- .../rocr/src/core/runtime/amd_gpu_agent.cpp | 8 +- .../src/core/runtime/amd_loader_context.cpp | 16 +- .../src/core/runtime/amd_memory_region.cpp | 16 +- .../src/core/runtime/hsa_ext_interface.cpp | 40 +- .../src/core/runtime/interrupt_signal.cpp | 2 +- ROCm_Libraries/rocr/src/core/runtime/isa.cpp | 16 +- .../rocr/src/core/runtime/runtime.cpp | 2 +- .../rocr/src/core/runtime/signal.cpp | 16 +- .../rocr/src/core/util/atomic_helpers.h | 16 +- .../rocr/src/core/util/lnx/os_linux.cpp | 16 +- ROCm_Libraries/rocr/src/core/util/locks.h | 16 +- ROCm_Libraries/rocr/src/core/util/os.h | 16 +- .../rocr/src/core/util/small_heap.cpp | 16 +- .../rocr/src/core/util/small_heap.h | 18 +- ROCm_Libraries/rocr/src/core/util/timer.cpp | 16 +- ROCm_Libraries/rocr/src/core/util/timer.h | 16 +- ROCm_Libraries/rocr/src/core/util/utils.h | 16 +- ROCm_Libraries/rocr/src/inc/amd_hsa_common.h | 16 +- ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h | 16 +- .../rocr/src/inc/amd_hsa_kernel_code.h | 16 +- ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h | 16 +- ROCm_Libraries/rocr/src/inc/hsa.h | 22 +- ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h | 2 +- .../rocr/src/inc/hsa_ext_finalize.h | 16 +- ROCm_Libraries/rocr/src/inc/hsa_ext_image.h | 28 +- .../rocr/src/inc/hsa_ven_amd_aqlprofile.h | 10 +- .../rocr/src/libamdhsacode/amd_elf_image.cpp | 2 +- 
ROCm_Libraries/rocr/src/loader/loaders.hpp | 2 +- ROCm_Network_Based_Programing/ROCm_RDMA.rst | 4 +- ROCm_Solutions/ROCr_Error_Codes.rst | 2 +- .../ROCm-System-Managment.rst | 126 +- ROCm_System_Managment/topo1.rst | 2 +- ROCm_System_Managment/topo2.rst | 44 +- ROCm_Tools/HCC-Native-GCN-ISA.rst | 10 +- ROCm_Tools/HCC_WIKI.rst | 28 +- ROCm_Tools/ROCm-Tools.rst | 330 +++--- ROCm_Tools/clBLA.rst | 4 +- ROCm_Tools/clFFT.rst | 4 +- ROCm_Tools/clRNG.rst | 8 +- ROCm_Tools/clSPARSE.rst | 8 +- ROCm_Tools/hcFFT.rst | 20 +- ROCm_Tools/hcRNG.rst | 32 +- ROCm_Tools/hipBLAS.rst | 2 +- ROCm_Tools/hipeigen.rst | 4 +- ROCm_Tools/hipinstall.rst | 8 +- ROCm_Tools/rocFFT.rst | 2 +- ROCm_Tools/rocFFTAPI.rst | 10 +- ROCm_Tools/rocblaswiki.rst | 18 +- ROCm_Tools/rocm-debug.rst | 16 +- ROCm_Tools/tensile.rst | 18 +- ROCm_Tools/tutorial.rst | 16 +- .../ROCm-Virtualization-&-Containers.rst | 46 +- .../ROCm-Virtualization-&-Containers.rst~ | 262 ----- ROCm_Virtualization_Containers/quickstart.rst | 2 +- Remote_Device_Programming/Memoryhooks.rst | 2 +- .../Performancemeasurement.rst | 6 +- Remote_Device_Programming/PrintUCXinfo.rst | 10 +- .../Remote-Device-Programming.rst | 82 +- Remote_Device_Programming/UCP-Design.rst | 2 +- Remote_Device_Programming/UCT-Design.rst | 4 +- Remote_Device_Programming/logging.rst | 4 +- Remote_Device_Programming/profiling.rst | 4 +- Remote_Device_Programming/reference | 2 +- .../sideprogresscompletion.rst | 6 +- Tutorial/GCN-asm-tutorial.rst | 22 +- Tutorial/Optimizing-Dispatches.rst | 14 +- Tutorial/ROCm-MultiGPU.rst | 4 +- Tutorial/Tutorial.rst | 6 +- Tutorial/caffe.rst | 50 +- Tutorial/hipCaffe .rst | 62 +- Tutorial/rocncloc.rst | 30 +- _templates/breadcrumbs.html | 2 +- amdstyles.css | 16 +- cleanup_text.sh | 45 + conf.py | 8 +- index.rst | 30 +- outline.rst | 40 +- 417 files changed, 7359 insertions(+), 7578 deletions(-) delete mode 100644 ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst~ create mode 100755 cleanup_text.sh diff --git a/Current_Release_Notes/Current-Release-Notes.rst b/Current_Release_Notes/Current-Release-Notes.rst index 55f644a8..a513330a 100644 --- a/Current_Release_Notes/Current-Release-Notes.rst +++ b/Current_Release_Notes/Current-Release-Notes.rst @@ -10,7 +10,7 @@ April 1st, 2020 What Is ROCm? ============== -ROCm is designed to be a universal platform for gpu-accelerated computing. This modular design allows hardware vendors to build drivers that support the ROCm framework. ROCm is also designed to integrate multiple programming languages and makes it easy to add support for other languages. +ROCm is designed to be a universal platform for gpu-accelerated computing. This modular design allows hardware vendors to build drivers that support the ROCm framework. ROCm is also designed to integrate multiple programming languages and makes it easy to add support for other languages. Note: You can also clone the source code for individual ROCm components from the GitHub repositories. 
@@ -20,13 +20,13 @@ ROCm Components The following components for the ROCm platform are released and available for the v3.3 release: -• Drivers +o Drivers -• Tools +o Tools -• Libraries +o Libraries -• Source Code +o Source Code You can access the latest supported version of drivers, tools, libraries, and source code for the ROCm platform at the following location: https://github.com/RadeonOpenCompute/ROCm @@ -44,7 +44,7 @@ The ROCm v3.3.x platform is designed to support the following operating systems: * RHEL v7.7 (Using devtoolset-7 runtime support) -* SLES 15 SP1 +* SLES 15 SP1 What\'s New in This Release @@ -55,16 +55,16 @@ What\'s New in This Release Users can install and access multiple versions of the ROCm toolkit simultaneously. -Previously, users could install only a single version of the ROCm toolkit. +Previously, users could install only a single version of the ROCm toolkit. Now, users have the option to install multiple versions simultaneously and toggle to the desired version of the ROCm toolkit. From the v3.3 release, multiple versions of ROCm packages can be installed in the */opt/rocm-* folder. - + **Prerequisites** ############################### Ensure the existing installations of ROCm, including */opt/rocm*, are completely removed before the v3.3 ROCm toolkit installation. The ROCm v3.3 package requires a clean installation. -* To install a single instance of ROCm, use the rocm-dkms or rocm-dev packages to install all the required components. This creates a symbolic link */opt/rocm* pointing to the corresponding version of ROCm installed on the system. +* To install a single instance of ROCm, use the rocm-dkms or rocm-dev packages to install all the required components. This creates a symbolic link */opt/rocm* pointing to the corresponding version of ROCm installed on the system. * To install individual ROCm components, create the */opt/rocm* symbolic link pointing to the version of ROCm installed on the system. For example, *# ln -s /opt/rocm-3.3.0 /opt/rocm* @@ -82,7 +82,7 @@ Review the following important notes: To install a single instance of the ROCm package, access the non-versioned packages. You must not install any components from the multi-instance set. -For example, +For example, * rocm-dkms @@ -96,7 +96,7 @@ A fresh installation or an upgrade of the single-version installation will remov **Multi Version Installation** -* To install a multi-instance of the ROCm package, access the versioned packages and components. +* To install a multi-instance of the ROCm package, access the versioned packages and components. For example, @@ -118,19 +118,19 @@ For example, .. image:: /Current_Release_Notes/MultiIns.png -**IMPORTANT**: A single instance ROCm package cannot co-exist with the multi-instance package. +**IMPORTANT**: A single instance ROCm package cannot co-exist with the multi-instance package. -**NOTE**: The multi-instance installation applies only to ROCm v3.3 and above. This package requires a fresh installation after the complete removal of existing ROCm packages. The multi-version installation is not backward compatible. +**NOTE**: The multi-instance installation applies only to ROCm v3.3 and above. This package requires a fresh installation after the complete removal of existing ROCm packages. The multi-version installation is not backward compatible. **GPU Process Information** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A new functionality to display process information for GPUs is available in this release. 
For example, you can view the process details to determine if the GPU(s) must be reset. +A new functionality to display process information for GPUs is available in this release. For example, you can view the process details to determine if the GPU(s) must be reset. To display the GPU process details, you can: -* Invoke the API +* Invoke the API or @@ -143,15 +143,15 @@ https://github.com/RadeonOpenCompute/rocm_smi_lib/blob/master/docs/ROCm_SMI_Manu **Support for 3D Pooling Layers** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -AMD ROCm is enhanced to include support for 3D pooling layers. The implementation of 3D pooling layers now allows users to run 3D convolutional networks, such as ResNext3D, on AMD Radeon Instinct GPUs. +AMD ROCm is enhanced to include support for 3D pooling layers. The implementation of 3D pooling layers now allows users to run 3D convolutional networks, such as ResNext3D, on AMD Radeon Instinct GPUs. **ONNX Enhancements** ~~~~~~~~~~~~~~~~~~~~~~~~~ -Open Neural Network eXchange (ONNX) is a widely-used neural net exchange format. The AMD model compiler & optimizer support the pre-trained models in ONNX, NNEF, & Caffe formats. Currently, ONNX versions 1.3 and below are supported. +Open Neural Network eXchange (ONNX) is a widely-used neural net exchange format. The AMD model compiler & optimizer support the pre-trained models in ONNX, NNEF, & Caffe formats. Currently, ONNX versions 1.3 and below are supported. -The AMD Neural Net Intermediate Representation (NNIR) is enhanced to handle the rapidly changing ONNX versions and its layers. +The AMD Neural Net Intermediate Representation (NNIR) is enhanced to handle the rapidly changing ONNX versions and its layers. .. image:: /Current_Release_Notes/onnx.png @@ -164,12 +164,12 @@ Code Object Manager (Comgr) Functions The following Code Object Manager (Comgr) functions are deprecated. -* `amd_comgr_action_info_set_options` -* `amd_comgr_action_info_get_options` +* `amd_comgr_action_info_set_options` +* `amd_comgr_action_info_get_options` -These functions were originally deprecated in version 1.3 of the Comgr library as they no longer support options with embedded spaces. +These functions were originally deprecated in version 1.3 of the Comgr library as they no longer support options with embedded spaces. -The deprecated functions are now replaced with the array-oriented options API, which includes +The deprecated functions are now replaced with the array-oriented options API, which includes * `amd_comgr_action_info_set_option_list` * `amd_comgr_action_info_get_option_list_count` @@ -179,9 +179,9 @@ The deprecated functions are now replaced with the array-oriented options API, w Hardware and Software Support Information ========================================== -AMD ROCm is focused on using AMD GPUs to accelerate computational tasks such as machine learning, engineering workloads, and scientific computing. In order to focus our development efforts on these domains of interest, ROCm supports a targeted set of hardware configurations. +AMD ROCm is focused on using AMD GPUs to accelerate computational tasks such as machine learning, engineering workloads, and scientific computing. In order to focus our development efforts on these domains of interest, ROCm supports a targeted set of hardware configurations. 
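As a minimal sketch of the array-oriented Comgr options API listed above, the snippet below sets an option list and reads back its count. The header name, the exact signatures, and the surrounding create/destroy calls are assumptions for illustration; only the set/get option-list function names come from the release notes::

    #include "amd_comgr.h"   // assumed header name for the Comgr library

    void set_comgr_options()
    {
        amd_comgr_action_info_t info;
        amd_comgr_create_action_info(&info);

        // Each option is a separate array element, so embedded spaces are no
        // longer a problem (illustrative options, not taken from the notes).
        const char *options[] = {"-mllvm", "-amdgpu-early-inline-all"};
        amd_comgr_action_info_set_option_list(info, options, 2);

        size_t count = 0;
        amd_comgr_action_info_get_option_list_count(info, &count);  // count == 2

        amd_comgr_destroy_action_info(info);
    }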
-For more information, see +For more information, see https://github.com/RadeonOpenCompute/ROCm diff --git a/Deep_learning/Deep-learning.rst b/Deep_learning/Deep-learning.rst index 5a6b65a3..20c9f43d 100644 --- a/Deep_learning/Deep-learning.rst +++ b/Deep_learning/Deep-learning.rst @@ -13,17 +13,17 @@ ROCm Tensorflow v1.14 Release We are excited to announce the release of ROCm enabled TensorFlow v1.14 for AMD GPUs. In this release we have the following features enabled on top of upstream TF1.14 enhancements: * We integrated ROCm RCCL library for mGPU communication, details in `RCCL github repo `_ - * XLA backend is enabled for AMD GPUs, the functionality is complete, performance optimization is in progress. + * XLA backend is enabled for AMD GPUs, the functionality is complete, performance optimization is in progress. ROCm Tensorflow v2.0.0-beta1 Release ***************************** In addition to Tensorflow v1.14 release, we also enabled Tensorflow v2.0.0-beta1 for AMD GPUs. The TF-ROCm 2.0.0-beta1 release supports Tensorflow V2 API. -Both whl packages and docker containers are available below. +Both whl packages and docker containers are available below. Tensorflow Installation *********************** -First, you’ll need to install the open-source ROCm 3.0 stack. Details can be found `here `_ +First, you'll need to install the open-source ROCm 3.0 stack. Details can be found `here `_ Then, install these other relevant ROCm packages: @@ -50,10 +50,10 @@ MIOpen ROCm MIOpen v2.0.1 Release ************************* -Announcing our new Foundation for Deep Learning acceleration MIOpen 2.0 which introduces support for Convolution Neural Network (CNN) acceleration — built to run on top of the ROCm software stack! +Announcing our new Foundation for Deep Learning acceleration MIOpen 2.0 which introduces support for Convolution Neural Network (CNN) acceleration -- built to run on top of the ROCm software stack! This release includes the following: - + * This release contains bug fixes and performance improvements. * Additionally, the convolution algorithm Implicit GEMM is now enabled by default * Known issues: @@ -81,7 +81,7 @@ The `porting guide `_ we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won’t always employ 100% of the GPU’s capabilities. Some reasons are the following: +The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a `previous blog `_ we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won't always employ 100% of the GPU's capabilities. Some reasons are the following: * The program may be written in a high level language that does not expose all of the features available on the hardware. 
- * The compiler is unable to produce optimal ISA code, either because the compiler needs to ‘play it safe’ while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. + * The compiler is unable to produce optimal ISA code, either because the compiler needs to 'play it safe' while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. -Consider a program that uses one of GCN’s new features (source code is available on `GitHub `_). Recent hardware architecture updates—DPP and DS Permute instructions—enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. +Consider a program that uses one of GCN's new features (source code is available on `GitHub `_). Recent hardware architecture updates--DPP and DS Permute instructions--enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. DS Permute Instructions ************************** -Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don’t write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says “put my lane data in lane i,” and ds_bpermute_b32 says “read data from lane i.” The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: +Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don't write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says "put my lane data in lane i," and ds_bpermute_b32 says "read data from lane i." The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: :: @@ -28,7 +28,7 @@ Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to mov Passing Parameters to a Kernel ******************************* -Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables—except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. 
The HSA host code might look like the following: +Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables--except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: :: @@ -50,7 +50,7 @@ Formal HSA arguments are passed to a kernel using a special read-only memory seg aql->kernarg_address = args; /* * Write the args directly to the kernargs buffer; - * the code assumes that memory is already allocated for the + * the code assumes that memory is already allocated for the * buffers that in_ptr, index_ptr and out_ptr point to */ args->in = in_ptr; @@ -71,9 +71,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to “in” buffer - Kernarg(index); // Append base pointer to “index” buffer - Kernarg(out); // Append base pointer to “out” buffer + Kernarg(in); // Add base pointer to "in" buffer + Kernarg(index); // Append base pointer to "index" buffer + Kernarg(out); // Append base pointer to "out" buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as @@ -91,7 +91,7 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru .text .p2align 8 .amdgpu_hsa_kernel hello_world - + hello_world: .amd_kernel_code_t @@ -131,7 +131,7 @@ Currently, a programmer must manually set all non-default values to provide the The GPR Counting ****************** -The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0–v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0–s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront’s SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: +The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. 
The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0-v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0-s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront's SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: :: diff --git a/Deep_learning/MXNet.rst b/Deep_learning/MXNet.rst index 82d880a9..a8187bd2 100644 --- a/Deep_learning/MXNet.rst +++ b/Deep_learning/MXNet.rst @@ -1,7 +1,7 @@ .. _mxnet: ========= -MXNet +MXNet ========= .. image:: MXNet_image1.png @@ -28,11 +28,11 @@ Prerequisites * Install ROCm Libraries :: - + sudo apt install -y rocm-device-libs rocm-libs rocblas hipblas rocrand rocfft * Install ROCm opencl - + :: sudo apt install -y rocm-opencl rocm-opencl-dev @@ -48,16 +48,16 @@ Prerequisites :: sudo apt install -y rocthrust rocprim hipcub - - + + **Install Dependencies to build mxnet for HIP/CUDA** -Install CUDA following the NVIDIA’s `installation guide `_ to setup MXNet with GPU support +Install CUDA following the NVIDIA's `installation guide `_ to setup MXNet with GPU support -.. note:: - * Make sure to add CUDA install path to LD_LIBRARY_PATH +.. note:: + * Make sure to add CUDA install path to LD_LIBRARY_PATH * Example - export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:$LD_LIBRARY_PATH - + Install the dependencies hipblas, rocrand, hcfft from source. Build the MXNet library @@ -66,9 +66,9 @@ Build the MXNet library **Step 1: Install build tools.** :: $ sudo apt-get update - $ sudo apt-get install -y build-essential - -**Step 2: Install OpenBLAS.** + $ sudo apt-get install -y build-essential + +**Step 2: Install OpenBLAS.** MXNet uses BLAS and LAPACK libraries for accelerated numerical computations on CPU machine. There are several flavors of BLAS/LAPACK libraries - OpenBLAS, ATLAS and MKL. In this step we install OpenBLAS. You can choose to install ATLAS or MKL. :: $ sudo apt-get install -y libopenblas-dev liblapack-dev libomp-dev libatlas-dev libatlas-base-dev @@ -78,9 +78,9 @@ Install OpenCV `_ here. MXNet uses OpenCV for efficient image loading and augmentation operations. :: $ sudo apt-get install -y libopencv-dev - - + + **Step 4: Download MXNet sources and build MXNet core shared library.** :: $ git clone --recursive https://github.com/ROCmSoftwarePlatform/mxnet.git @@ -96,25 +96,25 @@ MXNet uses OpenCV for efficient image loading and augmentation operations. 
**To compile on NVCC PLATFORM(HIP/CUDA):** :: $ export HIP_PLATFORM=nvcc - - + + **Step 6: To enable MIOpen for higher acceleration :** :: - USE_CUDNN=1 - + USE_CUDNN=1 + **Step 7:** **If building on CPU:** :: make -jn(n=number of cores) USE_GPU=0 (For Ubuntu 16.04) make -jn(n=number of cores) CXX=g++-6 USE_GPU=0 (For Ubuntu 18.04) - + **If building on GPU:** :: make -jn(n=number of cores) USE_GPU=1 (For Ubuntu 16.04) - make -jn(n=number of cores) CXX=g++-6 USE_GPU=1 (For Ubuntu 18.04) - + make -jn(n=number of cores) CXX=g++-6 USE_GPU=1 (For Ubuntu 18.04) + On succesfull compilation a library called libmxnet.so is created in mxnet/lib path. @@ -137,7 +137,7 @@ Install the MXNet Python binding **Step 2: Install the MXNet Python binding.** :: $ cd python - $ sudo python setup.py install + $ sudo python setup.py install **Step 3: Execute sample example** :: diff --git a/Deep_learning/caffe.rst b/Deep_learning/caffe.rst index 3f0da7d8..b39a379a 100644 --- a/Deep_learning/caffe.rst +++ b/Deep_learning/caffe.rst @@ -30,38 +30,38 @@ Installing ROCm Debian packages: :: PKG_REPO="http://repo.radeon.com/rocm/apt/debian/" - + wget -qO - $PKG_REPO/rocm.gpg.key | sudo apt-key add - - + sudo sh -c "echo deb [arch=amd64] $PKG_REPO xenial main > /etc/apt/sources.list.d/rocm.list" - + sudo apt-get update - + sudo apt-get install rocm rocm-utils rocm-opencl rocm-opencl-dev rocm-profiler cxlactivitylogger echo 'export PATH=/opt/rocm/bin:$PATH' >> $HOME/.bashrc - + echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc source $HOME/.bashrc - + sudo reboot - + Then, verify the installation. Double-check your kernel (at a minimum, you should see "kfd" in the name):: - + uname -r - + In addition, check that you can run the simple HSA vector_copy sample application:: - + cd /opt/rocm/hsa/sample make ./vector_copy - + Pre-requisites Installation ++++++++++++++++++++++++++++ Install Caffe dependencies:: - + sudo apt-get install \ pkg-config \ protobuf-compiler \ @@ -78,24 +78,24 @@ Install Caffe dependencies:: libopencv-dev \ libfftw3-dev \ libelf-dev - + Install the necessary ROCm compute libraries:: - + sudo apt-get install rocm-libs miopen-hip miopengemm hipCaffe Build Steps +++++++++++++++++++++ Clone hipCaffe:: - - git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git + cd hipCaffe - + You may need to modify the Makefile.config file for your own installation. Then, build it:: - + cp ./Makefile.config.example ./Makefile.config - make + make To improve build time, consider invoking parallel make with the "-j$(nproc)" flag. @@ -103,7 +103,7 @@ Unit Testing ------------- Run the following commands to perform unit testing of different components of Caffe. 
-:: +:: make test ./build/test/test_all.testbin @@ -114,7 +114,7 @@ MNIST training ++++++++++++++++ Steps:: - + ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh @@ -123,7 +123,7 @@ CIFAR-10 training ++++++++++++++++++ Steps:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt @@ -142,7 +142,7 @@ Soumith's Convnet benchmarks Steps: :: - + git clone https://github.com/soumith/convnet-benchmarks.git cd convnet-benchmarks/caffe @@ -183,7 +183,7 @@ Sometimes when training with multiple GPUs, we hit this type of error signature: @ 0x8015c3 caffe::Solver<>::Solve() @ 0x71a277 caffe::P2PSync<>::Run() @ 0x42dcbc train() - + See this `comment `_. diff --git a/Deep_learning/hipCaffe .rst b/Deep_learning/hipCaffe .rst index 1d4ae7a8..051b9fb5 100644 --- a/Deep_learning/hipCaffe .rst +++ b/Deep_learning/hipCaffe .rst @@ -4,18 +4,18 @@ hipCaffe Quickstart Guide ########################### -In this quickstart guide, we’ll walk through the steps for ROCm installation. Then, we’ll run a few training and inference experiments and check their accuracy. +In this quickstart guide, we'll walk through the steps for ROCm installation. Then, we'll run a few training and inference experiments and check their accuracy. Install ROCm ------------- -Here are the main ROCm components we’ll be using:: +Here are the main ROCm components we'll be using:: sudo apt-get install rocm sudo apt-get install rocm-libs sudo apt-get install miopen-hip miopengemm - + And some misc packages:: - + sudo apt-get install -y \ g++-multilib \ libunwind-dev \ @@ -28,65 +28,65 @@ And some misc packages:: rpm \ unzip \ bc - + Verify ROCm ------------ Test a simple HIP sample:: - + cp -r /opt/rocm/hip/samples ~/hip-samples && cd ~/hip-samples/0_Intro/square/ - + make - + ./square.hip.out - + Install hipCaffe ---------------- Handle the Caffe dependencies first:: - + sudo apt-get install -y \ pkg-config \ protobuf-compiler \ libprotobuf-dev \ libleveldb-dev \ libsnappy-dev \ - libhdf5-serial-dev \ + libhdf5-serial-dev \ libatlas-base-dev \ libboost-all-dev \ libgflags-dev \ libgoogle-glog-dev \ - liblmdb-dev \ + liblmdb-dev \ python-numpy python-scipy python3-dev python-yaml python-pip \ python-skimage python-opencv python-protobuf \ libopencv-dev \ libfftw3-dev \ libelf-dev - + Note that you might need minor changes to Makefile.config (system dependent):: - + cd ~ - + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + cd hipCaffe - + cp ./Makefile.config.example ./Makefile.config - + make -j$(nproc) - + Workloads ----------- MNIST training +++++++++++++++ -Details on MNIST training can be found at this `link `_. - +Details on MNIST training can be found at this `link `_. + Here are the basic instructions:: ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh - + Expected result: >99% accuracy after 10000 iterations :: @@ -104,7 +104,7 @@ Expected result: >99% accuracy after 10000 iterations I0717 21:06:58.701591 9965 solver.cpp:404] Test net output #0: accuracy = 0.9917 I0717 21:06:58.701642 9965 solver.cpp:404] Test net output #1: loss = 0.0269806 (* 1 = 0.0269806 loss) I0717 21:06:58.701668 9965 solver.cpp:322] Optimization Done. - + CIFAR-10 training ++++++++++++++++++ @@ -112,14 +112,14 @@ CIFAR-10 training Details on CIFAR-10 training can be found at this `link `_. 
Here are the basic instructions:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt - + Expected result: >70% accuracy after 4000 iterations :: - + I0727 18:29:35.248363 33 solver.cpp:279] Solving CIFAR10_quick I0727 18:29:35.248366 33 solver.cpp:280] Learning Rate Policy: fixed I0727 18:29:35.248883 33 solver.cpp:337] Iteration 0, Testing net (#0) @@ -134,7 +134,7 @@ Expected result: >70% accuracy after 4000 iterations I0727 18:30:13.722070 33 solver.cpp:404] Test net output #0: accuracy = 0.7124 I0727 18:30:13.722090 33 solver.cpp:404] Test net output #1: loss = 0.848089 (* 1 = 0.848089 loss) I0727 18:30:13.722095 33 solver.cpp:322] Optimization Done. - + CaffeNet inference +++++++++++++++++++ @@ -142,20 +142,20 @@ CaffeNet inference Details on CaffeNet inference can be found at this `link `_. Here are the basic instructions:: - + ./data/ilsvrc12/get_ilsvrc_aux.sh ./scripts/download_model_binary.py models/bvlc_reference_caffenet ./build/examples/cpp_classification/classification.bin models/bvlc_reference_caffenet/deploy.prototxt models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel data/ilsvrc12/imagenet_mean.binaryproto data/ilsvrc12/synset_words.txt examples/images/cat.jpg - + Expected result: (note the ordering and associated percentages) :: - + ---------- Prediction for examples/images/cat.jpg ---------- 0.3134 - "n02123045 tabby, tabby cat" 0.2380 - "n02123159 tiger cat" 0.1235 - "n02124075 Egyptian cat" 0.1003 - "n02119022 red fox, Vulpes vulpes" 0.0715 - "n02127052 lynx, catamount" - + diff --git a/Doxyfile b/Doxyfile index ea5b10a3..0a743e43 100644 --- a/Doxyfile +++ b/Doxyfile @@ -802,7 +802,7 @@ FILE_PATTERNS = *.h *.cpp # be searched for input files as well. # The default value is: NO. -#YES -> NO for rocblas_handle to come along with Enums +#YES -> NO for rocblas_handle to come along with Enums RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be diff --git a/FAQ/FAQ_HIP.rst b/FAQ/FAQ_HIP.rst index 15bfceb6..a911c63c 100644 --- a/FAQ/FAQ_HIP.rst +++ b/FAQ/FAQ_HIP.rst @@ -31,7 +31,7 @@ Runtime/Driver API features At a high*level, the following features are not supported: -* Textures +* Textures * Dynamic parallelism (CUDA 5.0) * Managed memory (CUDA 6.5) * Graphics interoperability with OpenGL or Direct3D @@ -48,9 +48,9 @@ Kernel language features * Device*side dynamic memory allocations (malloc, free, new, delete) (CUDA 4.0) * Virtual functions, indirect functions and try/catch (CUDA 4.0) -* `__prof_trigger` +* `__prof_trigger` * PTX assembly (CUDA 4.0). HCC supports inline GCN assembly. -* Several kernel features are under development. See the `HIP Kernel Language `_ for more information. +* Several kernel features are under development. See the `HIP Kernel Language `_ for more information. These include @@ -66,23 +66,23 @@ Is HIP a drop*in replacement for CUDA? ****************************** No. HIP provides porting tools which do most of the work to convert CUDA code into portable C++ code that uses the HIP APIs. -Most developers will port their code from CUDA to HIP and then maintain the HIP version. +Most developers will port their code from CUDA to HIP and then maintain the HIP version. HIP code provides the same performance as native CUDA code, plus the benefits of running on AMD platforms. What specific version of CUDA does HIP support? 
************************************* -HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can +HIP APIs and features do not map to a specific CUDA version. HIP provides a strong subset of functionality provided in CUDA, and the hipify tools can scan code to identify any unsupported CUDA functions * this is useful for identifying the specific features required by a given application. However, we can provide a rough summary of the features included in each CUDA SDK and the support level in HIP: -* CUDA 4.0 and earlier : +* CUDA 4.0 and earlier : * HIP supports CUDA 4.0 except for the limitations described above. -* CUDA 5.0 : - * Dynamic Parallelism (not supported) +* CUDA 5.0 : + * Dynamic Parallelism (not supported) * cuIpc functions (under development). -* CUDA 5.5 : +* CUDA 5.5 : * CUPTI (not directly supported), `AMD GPUPerfAPI `_ can be used as an alternative in some cases) * CUDA 6.0 * Managed memory (under development) @@ -100,15 +100,15 @@ What libraries does HIP support? ***************************** HIP includes growing support for the 4 key math libraries using hcBlas, hcFft, hcrng and hcsparse. -These offer pointer*based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HCC applications. Developers should use conditional compilation if portability to nvcc systems is desired * using calls to cu* routines on one path and hc* routines on the other. +These offer pointer*based memory interfaces (as opposed to opaque buffers) and can be easily interfaced with other HCC applications. Developers should use conditional compilation if portability to nvcc systems is desired * using calls to cu* routines on one path and hc* routines on the other. * `rocblas `_ * `rocfft `_ * `MIOpen `_ -* hipRAND Under Development - +* hipRAND Under Development + Additionally, some of the cublas routines are automatically converted to hipblas equivalents by the hipify*clang tool. These APIs use cublas or hcblas depending on the platform, and replace the need -to use conditional compilation. +to use conditional compilation. How does HIP compare with OpenCL? ***************************** @@ -137,10 +137,10 @@ HIP and CUDA provide similar math library calls as well. In summary, the HIP ph This reduces the potential for error, and also makes it easy to automate the translation. HIP's goal is to quickly get the ported program running on both platforms with little manual intervention, so that the programmer can focus on performance optimizations. -There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99*based kernel language (rather than C++) and also does not support single*source compilation. +There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99*based kernel language (rather than C++) and also does not support single*source compilation. As a result, the OpenCL syntax is different from CUDA, and the porting tools have to perform some heroic transformations to bridge this gap. -The tools also struggle with more complex CUDA applications, in particular those that use templates, classes, or other C++ features inside the kernel. +The tools also struggle with more complex CUDA applications, in particular those that use templates, classes, or other C++ features inside the kernel. What hardware does HIP support? @@ -152,12 +152,12 @@ What hardware does HIP support? 
Does Hipify automatically convert all source code? ***************************** -Typically, hipify can automatically convert almost all run*time code, and the coordinate indexing device code ( threadIdx.x *> hipThreadIdx_x ). +Typically, hipify can automatically convert almost all run*time code, and the coordinate indexing device code ( threadIdx.x *> hipThreadIdx_x ). -Most device code needs no additional conversion, since HIP and CUDA have similar names for math and built*in functions. +Most device code needs no additional conversion, since HIP and CUDA have similar names for math and built*in functions. The hipify*clang tool will automatically modify the kernel signature as needed (automating a step that used to be done manually) -Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support. +Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support. In general, developers should always expect to perform some platform*specific tuning and optimization. @@ -175,8 +175,8 @@ Why use HIP rather than supporting CUDA directly? ***************************** While HIP is a strong subset of the CUDA, it is a subset. The HIP layer allows that subset to be clearly defined and documented. -Developers who code to the HIP API can be assured their code will remain portable across Nvidia and AMD platforms. -In addition, HIP defines portable mechanisms to query architectural features, and supports a larger 64*bit wavesize which expands the return type for cross*lane functions like ballot and shuffle from 32*bit ints to 64*bit ints. +Developers who code to the HIP API can be assured their code will remain portable across Nvidia and AMD platforms. +In addition, HIP defines portable mechanisms to query architectural features, and supports a larger 64*bit wavesize which expands the return type for cross*lane functions like ballot and shuffle from 32*bit ints to 64*bit ints. Can I develop HIP code on an Nvidia CUDA platform? ***************************** @@ -192,7 +192,7 @@ In some cases CUDA has a richer set of modes for some APIs, and some C++ capabil Can I develop HIP code on an AMD HCC platform? ***************************** -Yes. HIP's HCC path only exposes the APIs and functions that work on both NVCC and HCC back ends. "Extra" APIs, parameters and features that appear in HCC but not CUDA will typically cause compile* or run*time errors. Developers must use the HIP API for most accelerator code and bracket any HCC*specific code with preprocessor conditionals. +Yes. HIP's HCC path only exposes the APIs and functions that work on both NVCC and HCC back ends. "Extra" APIs, parameters and features that appear in HCC but not CUDA will typically cause compile* or run*time errors. Developers must use the HIP API for most accelerator code and bracket any HCC*specific code with preprocessor conditionals. Those concerned about portability should, of course, test their code on both platforms and should tune it for performance. Typically, HCC supports a more modern set of C++11/C++14/C++17 features, so HIP developers who want portability should be careful when using advanced C++ features on the hc path. 
diff --git a/GCN_ISA_Manuals/GCN-ISA-Manuals.rst b/GCN_ISA_Manuals/GCN-ISA-Manuals.rst index 55aedd3d..a72a38de 100644 --- a/GCN_ISA_Manuals/GCN-ISA-Manuals.rst +++ b/GCN_ISA_Manuals/GCN-ISA-Manuals.rst @@ -26,16 +26,16 @@ Inline GCN ISA Assembly Guide The Art of AMDGCN Assembly: How to Bend the Machine to Your Will ****************************************************************** -The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a `previous blog `_ we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won’t always employ 100% of the GPU’s capabilities. Some reasons are the following: +The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a `previous blog `_ we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won't always employ 100% of the GPU's capabilities. Some reasons are the following: * The program may be written in a high level language that does not expose all of the features available on the hardware. - * The compiler is unable to produce optimal ISA code, either because the compiler needs to ‘play it safe’ while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. + * The compiler is unable to produce optimal ISA code, either because the compiler needs to 'play it safe' while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. -Consider a program that uses one of GCN’s new features (source code is available on `GitHub `_). Recent hardware architecture updates—DPP and DS Permute instructions—enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. +Consider a program that uses one of GCN's new features (source code is available on `GitHub `_). Recent hardware architecture updates--DPP and DS Permute instructions--enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. DS Permute Instructions ************************** -Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don’t write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. 
In other words, ds_permute_b32 says “put my lane data in lane i,” and ds_bpermute_b32 says “read data from lane i.” The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: +Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don't write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says "put my lane data in lane i," and ds_bpermute_b32 says "read data from lane i." The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: :: @@ -47,7 +47,7 @@ Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to mov Passing Parameters to a Kernel ******************************* -Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables—except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: +Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables--except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: :: @@ -69,7 +69,7 @@ Formal HSA arguments are passed to a kernel using a special read-only memory seg aql->kernarg_address = args; /* * Write the args directly to the kernargs buffer; - * the code assumes that memory is already allocated for the + * the code assumes that memory is already allocated for the * buffers that in_ptr, index_ptr and out_ptr point to */ args->in = in_ptr; @@ -90,9 +90,9 @@ The host program should also allocate memory for the in, index and out buffers. 
out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to “in” buffer - Kernarg(index); // Append base pointer to “index” buffer - Kernarg(out); // Append base pointer to “out” buffer + Kernarg(in); // Add base pointer to "in" buffer + Kernarg(index); // Append base pointer to "index" buffer + Kernarg(out); // Append base pointer to "out" buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as @@ -110,7 +110,7 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru .text .p2align 8 .amdgpu_hsa_kernel hello_world - + hello_world: .amd_kernel_code_t @@ -146,13 +146,13 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru flat_store_dword v[3:4], v1 s_endpgm -Currently, a programmer must manually set all non-default values to provide the necessary information. Hopefully, this situation will change with new updates that bring automatic register counting and possibly a new syntax to fill that structure. Before the start of every wavefront execution, the GPU sets up the register state on the basis of the enable_sgpr_* and enable_vgpr_* flags. VGPR v0 is always initialized with a work-item ID in the x dimension. Registers v1 and v2 can be initialized with work-item IDs in the y and z dimensions, respectively. Scalar GPRs can be initialized with a work-group ID and work-group count in each dimension, a dispatch ID, and pointers to kernarg, the aql packet, the aql queue, and so on. Again, the AMDGPU-ABI specification contains a full list in in the section on initial register state. For this example, a 64-bit base kernarg address will be stored in the s[0:1] registers (enable_sgpr_kernarg_segment_ptr = 1), and the work-item thread ID will occupy v0 (by default). Below is the scheme showing initial state for our kernel. +Currently, a programmer must manually set all non-default values to provide the necessary information. Hopefully, this situation will change with new updates that bring automatic register counting and possibly a new syntax to fill that structure. Before the start of every wavefront execution, the GPU sets up the register state on the basis of the enable_sgpr_* and enable_vgpr_* flags. VGPR v0 is always initialized with a work-item ID in the x dimension. Registers v1 and v2 can be initialized with work-item IDs in the y and z dimensions, respectively. Scalar GPRs can be initialized with a work-group ID and work-group count in each dimension, a dispatch ID, and pointers to kernarg, the aql packet, the aql queue, and so on. Again, the AMDGPU-ABI specification contains a full list in in the section on initial register state. For this example, a 64-bit base kernarg address will be stored in the s[0:1] registers (enable_sgpr_kernarg_segment_ptr = 1), and the work-item thread ID will occupy v0 (by default). Below is the scheme showing initial state for our kernel. .. image:: initial_state-768x387.png The GPR Counting ****************** -The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0–v4, so workitem_vgpr_count = 5. 
But wavefront_sgpr_count = 8 even though the code only shows s0–s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront’s SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations:
+The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0-v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0-s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront's SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use the following formulas for all three GCN GPU generations:

::

diff --git a/GCN_ISA_Manuals/PCIe-features.rst b/GCN_ISA_Manuals/PCIe-features.rst
index 09440f69..935e77db 100644
--- a/GCN_ISA_Manuals/PCIe-features.rst
+++ b/GCN_ISA_Manuals/PCIe-features.rst
@@ -17,15 +17,15 @@ The new PCIe AtomicOps operate as completers for CAS(Compare and Swap), FetchADD
Currently, ROCm uses this capability as follows:
-* Update HSA queue’s read_dispatch_id: 64bit atomic add used by the command processor on the GPU agent to update the packet ID it processed.
-* Update HSA queue’s write_dispatch_id: 64bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions.
-* Update HSA Signals – 64bit atomic ops are used for CPU & GPU synchronization.
+* Update HSA queue's read_dispatch_id: 64bit atomic add used by the command processor on the GPU agent to update the packet ID it processed.
+* Update HSA queue's write_dispatch_id: 64bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions.
+* Update HSA Signals - 64bit atomic ops are used for CPU & GPU synchronization.
The PCIe 3.0 AtomicOp feature allows atomic transactions to be requested by, routed through and completed by PCIe components. Routing and completion do not require software support. Component support for each is detectable via the DEVCAP2 register. Upstream bridges need to have AtomicOp routing enabled or the Atomic Operations will fail even though the PCIe endpoint and PCIe I/O devices have the capability for Atomic Operations.
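As a rough illustration of why the platform must route and complete these 64-bit AtomicOps, the following HIP sketch (not part of the specification text, and with illustrative kernel and variable names) has the GPU perform a 64-bit atomic add on host-pinned system memory, the same kind of cross-device atomic used for HSA queue indices and signals; hipHostMalloc, hipHostGetDevicePointer, and device-side atomicAdd are standard HIP calls::

    #include <hip/hip_runtime.h>
    #include <cstdio>

    __global__ void signal_done(unsigned long long* flag)
    {
        if (hipThreadIdx_x == 0 && hipBlockIdx_x == 0) {
            // 64-bit atomic add on memory that physically lives in system RAM;
            // across the bus this relies on a FetchADD-style AtomicOp.
            atomicAdd(flag, 1ULL);
        }
    }

    int main()
    {
        unsigned long long* host_flag = nullptr;
        hipHostMalloc(reinterpret_cast<void**>(&host_flag),
                      sizeof(*host_flag), hipHostMallocMapped);  // pinned, device-visible
        *host_flag = 0;

        unsigned long long* dev_flag = nullptr;
        hipHostGetDevicePointer(reinterpret_cast<void**>(&dev_flag), host_flag, 0);

        hipLaunchKernelGGL(signal_done, dim3(1), dim3(64), 0, 0, dev_flag);
        hipDeviceSynchronize();

        printf("completion flag = %llu\n", *host_flag);
        hipHostFree(host_flag);
        return 0;
    }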
To enable AtomicOp routing between two or more Root Ports, each associated Root Port must indicate that capability via the AtomicOp Routing Supported bit in the Device Capabilities 2 register.
-If your system has a PCIe Express Switch it needs to support AtomicsOp routing. Again AtomicOp requests are permitted only if a component’s DEVCTL2.ATOMICOP_REQUESTER_ENABLE field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 Routing is supported, AtomicOp Routing Support=0 routing is not supported.
+If your system has a PCIe switch, it needs to support AtomicOp routing. Again, AtomicOp requests are permitted only if a component's DEVCTL2.ATOMICOP_REQUESTER_ENABLE field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 means routing is supported; AtomicOp Routing Support=0 means routing is not supported.
An Atomic Operation is a Non-Posted transaction supporting 32- and 64-bit address formats; there must be a Completion response containing the result of the operation. Errors associated with the operation (an uncorrectable error accessing the target location or carrying out the Atomic operation) are signaled to the requester by setting the Completion Status field in the completion descriptor to Completer Abort (CA) or Unsupported Request (UR).
@@ -51,12 +51,12 @@ Future bus technology with richer I/O Atomics Operation Support
* `GenZ `_
-New PCIe Endpoints with support beyond AMD Ryzen and EPIC CPU; Intel Haswell or newer CPU’s with PCIe Generation 3.0 support.
+New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPUs; Intel Haswell or newer CPUs with PCIe Generation 3.0 support.
* `Mellanox Bluefield SOC `_
* `Cavium Thunder X2 `_
-In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets: 
+In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets:
1. write to another GPU memory
2. then write to system memory to indicate transfer complete
@@ -86,36 +86,36 @@ For GFX9 and Vega10 which have Physical Address up 44 bit and 48 bit Virtual add
* BAR4 register: Optional, not a boot device.
* BAR5 register: 32bit, non-prefetchable, MMIO. Must be placed < 4GB.
-Here is how our BAR works on GFX 8 GPU’s with 40 bit Physical Address Limit
+Here is how our BAR works on GFX8 GPUs with a 40-bit physical address limit
::
11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO Series] (rev c1)
-
+
Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0b35
-
+
Flags: bus master, fast devsel, latency 0, IRQ 119
-
+
Memory at bf40000000 (64-bit, prefetchable) [size=256M]
-
+
Memory at bf50000000 (64-bit, prefetchable) [size=2M]
-
+
I/O ports at 3000 [size=256]
-
+
Memory at c7400000 (32-bit, non-prefetchable) [size=256K]
-
+
Expansion ROM at c7440000 [disabled] [size=128K]
Legend:
-**1** : GPU Frame Buffer BAR – In this example it happens to be 256M, but typically this will be size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs.
+**1** : GPU Frame Buffer BAR - In this example it happens to be 256M, but typically this will be the size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs.
-**2** : Doorbell BAR – The size of the BAR is typically will be < 10MB (currently fixed at 2MB) for this generation GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs.
+**2** : Doorbell BAR - The size of this BAR is typically < 10MB (currently fixed at 2MB) for this generation of GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs.
**3** : IO BAR - This is for legacy VGA and boot device support; since the GPUs in this project are not VGA devices (headless), this is not a concern even if the SBIOS does not set it up.
-**4** : MMIO BAR – This is required for the AMD Driver SW to access the configuration registers. Since the reminder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB.
+**4** : MMIO BAR - This is required for the AMD Driver SW to access the configuration registers. Since the remainder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB.
-**5** : Expansion ROM – This is required for the AMD Driver SW to access the GPU’s video-bios. This is currently fixed at 128KB.
+**5** : Expansion ROM - This is required for the AMD Driver SW to access the GPU's video BIOS. This is currently fixed at 128KB.
===============================================================
Excerpts from Overview of Changes to PCI Express 3.0
@@ -126,20 +126,20 @@ By Mike Jackson, Senior Staff Architect, MindShare, Inc.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-Atomic Operations – Goal:
+Atomic Operations - Goal:
-------------------------
Support SMP-type operations across a PCIe network to allow for things like offloading tasks between CPU cores and accelerators like a GPU. The spec says this enables advanced synchronization mechanisms that are particularly useful with multiple producers or consumers that need to be synchronized in a non-blocking fashion. Three new atomic non-posted requests were added, plus the corresponding completion (the address must be naturally aligned with the operand size or the TLP is malformed):
-* Fetch and Add – uses one operand as the “add” value. Reads the target location, adds the operand, and then writes the result back to the original location.
-* Unconditional Swap – uses one operand as the “swap” value. Reads the target location and then writes the swap value to it.
-* Compare and Swap – uses 2 operands: first data is compare value, second is swap value. Reads the target location, checks it against the compare value and, if equal, writes the swap value to the target location.
-* AtomicOpCompletion – new completion to give the result so far atomic request and indicate that the atomicity of the transaction has been maintained.
+* Fetch and Add - uses one operand as the "add" value. Reads the target location, adds the operand, and then writes the result back to the original location.
+* Unconditional Swap - uses one operand as the "swap" value. Reads the target location and then writes the swap value to it.
+* Compare and Swap - uses 2 operands: first data is compare value, second is swap value.
Reads the target location, checks it against the compare value and, if equal, writes the swap value to the target location. +* AtomicOpCompletion - new completion to give the result so far atomic request and indicate that the atomicity of the transaction has been maintained. -Since AtomicOps are not locked they don’t have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide “lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic.” The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation. +Since AtomicOps are not locked they don't have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide "lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic." The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation. AtomicOps can go from device to device, device to host, or host to device. Each completer indicates whether it supports this capability and guarantees atomic access if it does. The ability to route AtomicOps is also indicated in the registers for a given port. -ID-based Ordering – Goal: +ID-based Ordering - Goal: ------------------------- Improve performance by avoiding stalls caused by ordering rules. For example, posted writes are never normally allowed to pass each other in a queue, but if they are requested by different functions, we can have some confidence that the requests are not dependent on each other. The previously reserved Attribute bit [2] is now combined with the RO bit to indicate ID ordering with or without relaxed ordering. diff --git a/GCN_ISA_Manuals/caffe.rst b/GCN_ISA_Manuals/caffe.rst index 3eef79e3..070603b6 100644 --- a/GCN_ISA_Manuals/caffe.rst +++ b/GCN_ISA_Manuals/caffe.rst @@ -30,38 +30,38 @@ Installing ROCm Debian packages: :: PKG_REPO="http://repo.radeon.com/rocm/apt/debian/" - + wget -qO - $PKG_REPO/rocm.gpg.key | sudo apt-key add - - + sudo sh -c "echo deb [arch=amd64] $PKG_REPO xenial main > /etc/apt/sources.list.d/rocm.list" - + sudo apt-get update - + sudo apt-get install rocm rocm-utils rocm-opencl rocm-opencl-dev rocm-profiler cxlactivitylogger echo 'export PATH=/opt/rocm/bin:$PATH' >> $HOME/.bashrc - + echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc source $HOME/.bashrc - + sudo reboot - + Then, verify the installation. Double-check your kernel (at a minimum, you should see "kfd" in the name):: - + uname -r - + In addition, check that you can run the simple HSA vector_copy sample application:: - + cd /opt/rocm/hsa/sample make ./vector_copy - + Pre-requisites Installation ++++++++++++++++++++++++++++ Install Caffe dependencies:: - + sudo apt-get install \ pkg-config \ protobuf-compiler \ @@ -78,24 +78,24 @@ Install Caffe dependencies:: libopencv-dev \ libfftw3-dev \ libelf-dev - + Install the necessary ROCm compute libraries:: - + sudo apt-get install rocm-libs miopen-hip miopengemm hipCaffe Build Steps +++++++++++++++++++++ Clone hipCaffe:: - - git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git + cd hipCaffe - + You may need to modify the Makefile.config file for your own installation. 
Then, build it:: - + cp ./Makefile.config.example ./Makefile.config - make + make To improve build time, consider invoking parallel make with the "-j$(nproc)" flag. @@ -103,7 +103,7 @@ Unit Testing ------------- Run the following commands to perform unit testing of different components of Caffe. -:: +:: make test ./build/test/test_all.testbin @@ -114,7 +114,7 @@ MNIST training ++++++++++++++++ Steps:: - + ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh @@ -123,7 +123,7 @@ CIFAR-10 training ++++++++++++++++++ Steps:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt @@ -163,7 +163,7 @@ Sometimes when training with multiple GPUs, we hit this type of error signature: @ 0x8015c3 caffe::Solver<>::Solve() @ 0x71a277 caffe::P2PSync<>::Run() @ 0x42dcbc train() - + See this `comment `_. diff --git a/GCN_ISA_Manuals/testdocbook.rst b/GCN_ISA_Manuals/testdocbook.rst index af71ad61..0154b02d 100644 --- a/GCN_ISA_Manuals/testdocbook.rst +++ b/GCN_ISA_Manuals/testdocbook.rst @@ -101,7 +101,7 @@ Summary of kernel instruction changes in Vega GPUs: - New packed 16-bit math instructions. :: - + V_PK_MAD_I16 V_PK_MUL_LO_U16 V_PK_ADD_I16 V_PK_SUB_I16 V_PK_LSHLREV_B16 V_PK_LSHRREV_B16 V_PK_ASHRREV_I16 V_PK_MAX_I16 V_PK_MIN_I16 V_PK_MAD_U16 V_PK_ADD_U16 V_PK_SUB_U16 @@ -159,7 +159,7 @@ The figure below shows a block diagram of the AMD GCN Vega Generation series pro AMD GCN VEGA Generation Series Block Diagram -The GCN device includes a data-parallel processor (DPP) array, a command processor, a memory controller, and other logic (not shown). The GCN command processor reads commands that the host has written to memory-mapped GCN registers in the system-memory address space. The command processor sends hardware-generated interrupts to the host when the command is completed. The GCN memory controller has direct access to all GCN device memory and the host-specified areas of system memory. To satisfy read and write requests, the memory controller performs the functions of a direct-memory access (DMA) controller, including computing memory-address offsets based on the format of the requested data in memory. In the GCN environment, a complete application includes two parts: +The GCN device includes a data-parallel processor (DPP) array, a command processor, a memory controller, and other logic (not shown). The GCN command processor reads commands that the host has written to memory-mapped GCN registers in the system-memory address space. The command processor sends hardware-generated interrupts to the host when the command is completed. The GCN memory controller has direct access to all GCN device memory and the host-specified areas of system memory. To satisfy read and write requests, the memory controller performs the functions of a direct-memory access (DMA) controller, including computing memory-address offsets based on the format of the requested data in memory. In the GCN environment, a complete application includes two parts: - a program running on the host processor, and - programs, called kernels, running on the GCN processor. @@ -175,16 +175,16 @@ The GCN programs are controlled by host commands that - cause the GCN GPU to begin execution of a program. The GCN driver program runs on the host. - + The DPP array is the heart of the GCN processor. 
The array is organized as a set of compute unit pipelines, each independent from the others, that operate in parallel on streams of floating-point or integer data.The compute unit pipelines can process data or, through the memory controller, transfer data to, or from, memory. Computation in a compute unit pipeline can be made conditional. Outputs written to memory can also be made conditional. -When it receives a request, the compute unit pipeline loads instructions and data from memory, begins execution, and continues until the end of the kernel. As kernels are running, the GCN hardware automatically fetches instructions from memory into on-chip caches; GCN software plays no role in this. GCN kernels can load data from off-chip memory into on-chip general-purpose registers (GPRs) and caches. +When it receives a request, the compute unit pipeline loads instructions and data from memory, begins execution, and continues until the end of the kernel. As kernels are running, the GCN hardware automatically fetches instructions from memory into on-chip caches; GCN software plays no role in this. GCN kernels can load data from off-chip memory into on-chip general-purpose registers (GPRs) and caches. The AMD GCN devices can detect floating point exceptions and can generate interrupts. In particular, they detect IEEE floating-point exceptions in hardware; these can be recorded for post-execution analysis. The software interrupts shown in the previous figure from the command processor to the host represent hardware-generated interrupts for signaling command-completion and related management functions. The GCN processor hides memory latency by keeping track of potentially hundreds of work-items in different stages of execution, and by -overlapping compute operations with memory-access operations. +overlapping compute operations with memory-access operations. The figure below shows the dataflow for a GCN application. For general-purpose applications, only one processing block performs all computation. @@ -247,7 +247,7 @@ Terminology | | address, data format, stride, etc. | +-----------------------+----------------------------------------------------+ - **Table : Basic Terms Uses** + **Table : Basic Terms Uses** Program Organization ==================== @@ -701,7 +701,7 @@ SGPR Allocation and storage ~~~~~~~~~~~~~~~~~~~~~~~~~~~ A wavefront can be allocated 16 to 102 SGPRs, in units of 16 GPRs (Dwords). These are logically viewed as SGPRs 0-101. The VCC is -physically stored as part of the wavefront’s SGPRs in the highest numbered two SGPRs (SGPR 106 and 107; the source/destination VCC is an alias for those two SGPRs). When a trap handler is present, 16 additional SGPRs are reserved after VCC to hold the trap addresses, as well as saved-PC and trap-handler temps. These all are privileged (cannot be written to unless privilege is set). Note that if a wavefront allocates 16 SGPRs, 2 SGPRs are normally used as VCC, the remaining 14 are available to the shader. Shader hardware does not prevent use of all 16 SGPRs. +physically stored as part of the wavefront's SGPRs in the highest numbered two SGPRs (SGPR 106 and 107; the source/destination VCC is an alias for those two SGPRs). When a trap handler is present, 16 additional SGPRs are reserved after VCC to hold the trap addresses, as well as saved-PC and trap-handler temps. These all are privileged (cannot be written to unless privilege is set). 
Note that if a wavefront allocates 16 SGPRs, 2 SGPRs are normally used as VCC, the remaining 14 are available to the shader. Shader hardware does not prevent use of all 16 SGPRs. SGPR Alignment ~~~~~~~~~~~~~~ @@ -736,13 +736,13 @@ for: - Local Data Share (LDS) - - Interpolation: holds { 1’b0, new\_prim\_mask[15:1], + - Interpolation: holds { 1'b0, new\_prim\_mask[15:1], parameter\_offset[15:0] } // in bytes - - LDS direct-read offset and data type: { 13’b0, DataType[2:0], + - LDS direct-read offset and data type: { 13'b0, DataType[2:0], LDS\_address[15:0] } // addr in bytes - - LDS addressing for Memory/Vfetch → LDS: {16’h0, lds\_offset[15:0]} + - LDS addressing for Memory/Vfetch -> LDS: {16'h0, lds\_offset[15:0]} // in bytes - Global Data Share (GDS) @@ -791,7 +791,7 @@ The EXEC mask determines which threads execute an instruction. The VCC indicates
-V\_CMP\_\* ⇒ VCC[n] = EXEC[n] & (test passed for thread[n]) +V\_CMP\_\* => VCC[n] = EXEC[n] & (test passed for thread[n]) .. raw:: html @@ -811,7 +811,7 @@ SGPRs that happen to hold VCC). Trap and Exception registers ---------------------------- -Each type of exception can be enabled or disabled independently by setting, or clearing, bits in the TRAPSTS register’s EXCP\_EN field.This section describes the registers which control and report kernel exceptions. +Each type of exception can be enabled or disabled independently by setting, or clearing, bits in the TRAPSTS register's EXCP\_EN field.This section describes the registers which control and report kernel exceptions. All Trap temporary SGPRs (TTMP\*) are privileged for writes - they can be written only when in the trap handler (status.priv = 1). When not privileged, writes to these are ignored. TMA and TBA are read-only; they can be accessed through S\_GETREG\_B32. @@ -829,7 +829,7 @@ PC of the faulting instruction will be: (PC - PC\_rewind\*4). **STATUS . TRAP\_EN** - This bit indicates to the shader whether or not a trap handler is present. When one is not present, traps are not taken, -no matter whether they’re floating point, user-, or host-initiated +no matter whether they're floating point, user-, or host-initiated traps. When the trap handler is present, the wavefront uses an extra 16 SGPRs for trap processing. If trap\_en == 0, all traps and exceptions are ignored, and s\_trap is converted by hardware to NOP. @@ -940,7 +940,7 @@ Memory violations are not reported for instruction or scalar-data accesses. Memory Buffer to LDS does NOT return a memory violation if the LDS address is out of range, but masks off EXEC bits of threads that would go out of range. -When a memory access is in violation, the appropriate memory (LDS or TC) returns MEM\_VIOL to the wave. This is stored in the wave’s +When a memory access is in violation, the appropriate memory (LDS or TC) returns MEM\_VIOL to the wave. This is stored in the wave's TRAPSTS.mem\_viol bit. This bit is sticky, so once set to 1, it remains at 1 until the user clears it. There is a corresponding exception enable bit (EXCP\_EN.mem\_viol). If this bit is set when the memory returns with a violation, the wave jumps to the trap handler. @@ -1235,7 +1235,7 @@ This method compares how many of the 64 threads go down the PASS path instead of The following pseudo-code shows the details of CBRANCH Fork and Join operations. :: - + S_CBRANCH_G_FORK arg0, arg1 // arg1 is an sgpr-pair which holds 64bit (48bit) target address @@ -1270,8 +1270,8 @@ The following pseudo-code shows the details of CBRANCH Fork and Join operations. else CSP -- // this is the 1st time to JOIN: jump to other FORK path {PC, EXEC} = SGPR[CSP*4] // read 128-bits from 4 consecutive SGPRs - - + + Scalar ALU Operations ===================== @@ -1550,7 +1550,7 @@ comparison yielded a TRUE result. | S\_BITCMP1\_{B32,B64 | SOPC | y | Test for "is a bit one". SCC = | | } | | | S0[S1]. | +----------------------+----------+----------+------------------------------------+ - + **Table : Conditional Instructions** Bit-Wise Instructions @@ -1628,7 +1628,7 @@ below, SCC is set if the result is nonzero. | | S\_FLBIT\_I32 | SOP1 | n | | Count how many bits in a row | | | S\_FLBIT\_I32\_I64 | | | (from MSB to LSB) are the same | | | | | as the sign bit. Return -1 if | -| | | | the input is zero or all 1’s | +| | | | the input is zero or all 1's | | | | | (-1). 
32-bit pseudo-code: | | | | | | if (S0 == 0 \|\| S0 == -1) D = | | | | | -1 | @@ -1973,7 +1973,7 @@ bits; codes 0 to 255 can be the scalar source if it is eight bits; codes +-----------+--------------------+-----------------------------------------------+ | 236 | SHARED\_LIMIT | | +-----------+--------------------+-----------------------------------------------+ -| 237 | PRIVATE\_BASE | | +| 237 | PRIVATE\_BASE | | +-----------+--------------------+-----------------------------------------------+ | 238 | PRIVATE\_LIMIT | | +-----------+--------------------+-----------------------------------------------+ @@ -1994,7 +1994,7 @@ bits; codes 0 to 255 can be the scalar source if it is eight bits; codes +-----------+--------------------+-----------------------------------------------+ | 243 | -1.0 | | +-----------+--------------------+-----------------------------------------------+ -| 244 | 2.0 | | +| 244 | 2.0 | | +-----------+--------------------+-----------------------------------------------+ | 245 | -2.0 | | +-----------+--------------------+-----------------------------------------------+ @@ -2200,7 +2200,7 @@ encoding. Table: VALU Instruction Set -| +| | The next table lists the compare instructions. +----------------+----------------+------------------------------+------------------------------+ @@ -2209,7 +2209,7 @@ Table: VALU Instruction Set | V\_CMP | I16, I32, I64, | F, LT, EQ, LE, GT, LG, GE, T | Write VCC.. | | | U16, U32, U64 | | | +----------------+----------------+------------------------------+------------------------------+ -| V\_CMPX | Write VCC and | | | +| V\_CMPX | Write VCC and | | | | | exec. | | | +----------------+----------------+------------------------------+------------------------------+ | V\_CMP | F16, F32, F64 | | F, LT, EQ,LE, GT, LG, GE, | Write VCC. | @@ -2823,7 +2823,7 @@ VGPRs. | | TBUFFER\_STORE\_FORMAT\_{x, | | | xy,xyz,xyzw} | | +-------------------------------+--------------------------------------------+ -| MUBUF Instructions | | +| MUBUF Instructions | | +-------------------------------+--------------------------------------------+ | | BUFFER\_LOAD\_FORMAT\_{x,xy | | Read to, or write from, an untyped | | ,xyz,xyzw} | buffer object. | @@ -3051,14 +3051,14 @@ Dst\_sel comes from the resource, but is ignored for many operations. Table: Buffer Instructions -**Instruction** : The instruction’s dfmt and nfmt fields are used -instead of the resource’s fields. +**Instruction** : The instruction's dfmt and nfmt fields are used +instead of the resource's fields. **Data format derived** : The data format is derived from the opcode and ignores the resource definition. For example, buffer\_load\_ubyte sets the data-format to 8 and number-format to uint. -.. note:: The resource’s data format must not be INVALID; that format has special meaning (unbound resource), and for that case the data format is not replaced by the instruction’s implied data format. +.. note:: The resource's data format must not be INVALID; that format has special meaning (unbound resource), and for that case the data format is not replaced by the instruction's implied data format. **DST\_SEL identity** : Depending on the number of components in the data-format, this is: X000, XY00, XYZ0, or XYZW. @@ -3271,7 +3271,7 @@ Swizzled Buffer Addressing Swizzled addressing rearranges the data in the buffer to help provide improved cache locality for arrays of structures. Swizzled addressing also requires Dword-aligned accesses. 
A single fetch instruction cannot -attempt to fetch a unit larger than const-element-size. The buffer’s +attempt to fetch a unit larger than const-element-size. The buffer's STRIDE must be a multiple of element\_size. :: @@ -3434,7 +3434,7 @@ the following subset of MUBUF instructions. - BUFFER\_LOAD\_{ubyte, sbyte, ushort, sshort, dword, format\_x}. -- It is illegal to set the instruction’s TFE bit for loads to LDS. +- It is illegal to set the instruction's TFE bit for loads to LDS. .. raw:: html @@ -3783,19 +3783,19 @@ image opcodes. | 1 | 1D | x | slice | | | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 1 | 2D | x | y | | | | +| 1 | 2D | x | y | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 2D | x | y | fragid | | | | | MSAA | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | 2D | x | y | slice | | | -| | Array | | | | | | +| 2 | 2D | x | y | slice | | | +| | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 2D | x | y | slice | fragid | | | | Array | | | | | | -| | MSAA | | | | | | +| | MSAA | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | 3D | x | y | z | | | +| 2 | 3D | x | y | z | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | Cube | x | y | face\_id | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ @@ -3807,7 +3807,7 @@ image opcodes. +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 2D | x | y | mipid | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 2D | x | y | slice | mipid | | +| 3 | 2D | x | y | slice | mipid | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 3D | x | y | z | mipid | | @@ -3847,11 +3847,11 @@ gradients. +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 3D | x | y | z | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | Cube | x | y | face\_id | | | +| 2 | Cube | x | y | face\_id | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | sample\_l | 1 | 1D | x | lod | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | 1D | x | slice | lod | | | +| 2 | 1D | x | slice | lod | | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 2D | x | y | lod | | | @@ -3860,20 +3860,20 @@ gradients. 
| | interl | | | | | | | | aced | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 2D | x | y | slice | lod | | +| 3 | 2D | x | y | slice | lod | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 3D | x | y | z | lod | | +| 3 | 3D | x | y | z | lod | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | Cube | x | y | face\_id | lod | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | sample\_cl | 1 | 1D | x | clamp | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 2 | 1D | x | slice | clamp | | | -| | Array | | | | | | +| | Array | | | | | | | 2 | 2D | x | y | clamp | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 2D | x | y | field | clamp | | +| 3 | 2D | x | y | field | clamp | | | | interl | | | | | | | | aced | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ @@ -3882,7 +3882,7 @@ gradients. +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 3D | x | y | z | clamp | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | Cube | x | y | face\_id | clamp | | +| 3 | Cube | x | y | face\_id | clamp | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | gather4 | 1 | 2D | x | y | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ @@ -3893,29 +3893,29 @@ gradients. | 2 | 2D | x | y | slice | | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 2 | Cube | x | y | face\_id | | | +| 2 | Cube | x | y | face\_id | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | gather4\_l | 2 | 2D | x | y | lod | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | 2D | x | y | field | lod | | +| 3 | 2D | x | y | field | lod | | | | interl | | | | | | | | aced | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 2D | x | y | slice | lod | | | | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | Cube | x | y | face\_id | lod | | +| 3 | Cube | x | y | face\_id | lod | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | gather4\_cl | 2 | 2D | x | y | clamp | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 2D | x | y | field | clamp | | -| | interl | | | | | | +| | interl | | | | | | | | aced | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ | 3 | 2D | x | y | slice | clamp | | -| | Array | | | | | | +| | Array | | | | | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ -| 3 | Cube | x | y | face\_id | clamp | | +| 3 | Cube | x | y | face\_id | clamp | | +--------------------+--------+-------------+-----------+-----------+-----------+-----------+ Table: Image Opcodes with Sampler @@ -3954,7 +3954,7 @@ instructions: | | | | for it to used in | | | | | 
LOD computation. | +--------------------+--------------------+--------------------+--------------------+ -| \_CD | Coarse Derivative | Send dx/dv, dx/dy, | | +| \_CD | Coarse Derivative | Send dx/dv, dx/dy, | | | | | etc. slopes to TA | | | | | for it to used in | | | | | LOD computation. | | @@ -4030,7 +4030,7 @@ These are all packed into consecutive VGPRs. - Writes: When writing an image object, it is only possible to write an entire element (all components), not just individual components. The components come from consecutive VGPRs, and the texture system fills - in the value zero for any missing components of the image’s data + in the value zero for any missing components of the image's data format; it ignores any values that are not part of the stored data format. For example, if the DMASK=1001, the shader sends Red from VGPR\_N, and Alpha from VGPR\_N+1, to the texture unit. If the image @@ -4075,7 +4075,7 @@ MIMG instructions. +----------+--------+---------------+---------------------------------------------+ | Bits | Size | Name | Comments | +==========+========+===============+=============================================+ -| **128-bi | | | | +| **128-bi | | | | | t | | | | | Resource | | | | | : | | | | @@ -4102,16 +4102,16 @@ MIMG instructions. +----------+--------+---------------+---------------------------------------------+ | 91:78 | 14 | height | height-1 of mip0 in texels | +----------+--------+---------------+---------------------------------------------+ -| 94:92 | 3 | perf | Scales sampler’s perf\_z, perf\_mip, | +| 94:92 | 3 | perf | Scales sampler's perf\_z, perf\_mip, | | | | modulation | aniso\_bias, lod\_bias\_sec. | +----------+--------+---------------+---------------------------------------------+ | 98:96 | 3 | dst\_sel\_x | 0 = 0, 1 = 1, 4 = R, 5 = G, 6 = B, 7 = A. | +----------+--------+---------------+---------------------------------------------+ -| 101:99 | 3 | dst\_sel\_y | | +| 101:99 | 3 | dst\_sel\_y | | +----------+--------+---------------+---------------------------------------------+ | 104:102 | 3 | dst\_sel\_z | | -+----------+--------+---------------+---------------------------------------------+ -| 107:105 | 3 | dst\_sel\_w | | ++----------+--------+---------------+---------------------------------------------+ +| 107:105 | 3 | dst\_sel\_w | | +----------+--------+---------------+---------------------------------------------+ | 111:108 | 4 | base level | largest mip level in the resource view. For | | | | | msaa, set to zero. | @@ -4129,7 +4129,7 @@ MIMG instructions. | | | | 2d-msaa, 15 = 2d-msaa-array. 1-7 are | | | | | reserved. | +----------+--------+---------------+---------------------------------------------+ -| **256-bi | | | | +| **256-bi | | | | | t | | | | | Resource | | | | | : | | | | @@ -4181,7 +4181,7 @@ MIMG instructions. | 213 | 1 | Compression | enable delta color compression | | | | Enable | | +----------+--------+---------------+---------------------------------------------+ -| 214 | 1 | Alpha is on | Set to 1 if the surface’s component swap is | +| 214 | 1 | Alpha is on | Set to 1 if the surface's component swap is | | | | MSB | not reversed (DCC) | +----------+--------+---------------+---------------------------------------------+ | 215 | 1 | Color | Auto=0, none=1 (DCC) | @@ -4217,7 +4217,7 @@ with every sample instruction. +====================+====================+====================+====================+ | 2:0 | 3 | clamp x | Clamp/wrap mode. 
| +--------------------+--------------------+--------------------+--------------------+ -| 5:3 | 3 | clamp y | | +| 5:3 | 3 | clamp y | | +--------------------+--------------------+--------------------+--------------------+ | 8:6 | 3 | clamp z | | +--------------------+--------------------+--------------------+--------------------+ @@ -4312,11 +4312,11 @@ VGPRs and sent to the texture cache. Any texture or buffer resources and samplers are also sent immediately. However, write-data is not immediately sent to the texture cache. -The shader developer’s responsibility to avoid data hazards associated +The shader developer's responsibility to avoid data hazards associated with VMEM instructions include waiting for VMEM read instruction completion before reading data fetched from the TC (VMCNT). -This is explained in the section: +This is explained in the section: :ref:`Vector Memory Operations` @@ -4516,10 +4516,10 @@ Table: Flat, Global and Scratch Microcode Formats +-------------------------+-------------------------+--------------------------+ | FLAT\_ATOMIC\_DEC | GLOBAL\_ATOMIC\_DEC | none | +-------------------------+-------------------------+--------------------------+ -| The atomic instructions | | | -| above are also | | | -| available in "\_X2" | | | -| versions (64-bit). | | | +| The atomic instructions | | | +| above are also | | | +| available in "\_X2" | | | +| versions (64-bit). | | | +-------------------------+-------------------------+--------------------------+ Table: Flat, Global and Scratch Opcodes @@ -4657,7 +4657,7 @@ The policy for threads with bad addresses is: writes outside this range do not write a value, and reads return zero. Addressing errors from either LDS or TA are returned on their respective -"instruction done" busses as MEM\_VIOL. This sets the wave’s MEM\_VIOL +"instruction done" busses as MEM\_VIOL. This sets the wave's MEM\_VIOL TrapStatus bit and causes an exception (trap) if the corresponding EXCPEN bit is set. @@ -4741,9 +4741,9 @@ memory structure. |fig 10 2| To load data into LDS from global memory, it is read from global memory -and placed into the work-item’s registers; then, a store is performed to +and placed into the work-item's registers; then, a store is performed to LDS. Similarly, to store data into global memory, data is read from LDS -and placed into the workitem’s registers, then placed into global +and placed into the workitem's registers, then placed into global memory. To make effective use of the LDS, an algorithm must perform many operations on what is transferred between global memory and LDS. It also is possible to load data from a memory buffer directly into LDS, @@ -4851,7 +4851,7 @@ number (0 to 32) and the component number (0=x, 1=y, 2=z and 3=w). | | | v\_interp\_p1 as a macro of two instructions. | +-------------+-------------+--------------------------------------------------+ | ( M0 ) | 32 | Use of the M0 register is automatic. M0 must | -| | | contain: { 1’b0, new\_prim\_mask[15:1], | +| | | contain: { 1'b0, new\_prim\_mask[15:1], | | | | lds\_param\_offset[15:0] } | +-------------+-------------+--------------------------------------------------+ @@ -4906,7 +4906,7 @@ The table below lists and briefly describes the LDS instruction fields. | | | ops treat the offset as a 16-bit signed Dword | | | | offset. 
| +-------------+-------------+--------------------------------------------------+ -| OFFSET1 | 8 | | +| OFFSET1 | 8 | | +-------------+-------------+--------------------------------------------------+ | VDST | 8 | VGPR to which result is written: either from | | | | LDS-load or atomic return value. | @@ -5120,7 +5120,7 @@ The export instruction uses the EXP microcode format. | | | | MRT: vsrc0=R, 1=G, | | | | 2=B, 3=A | +-------------------------+-------------------------+--------------------------+ -| VSRC2 | 8 | | +| VSRC2 | 8 | | +-------------------------+-------------------------+--------------------------+ | VSRC1 | 8 | | +-------------------------+-------------------------+--------------------------+ @@ -5188,7 +5188,7 @@ Multiple export instructions can be outstanding at one time. Exports of the same type (for example: position) are completed in order, but exports of different types can be completed out of order. -If the STATUS register’s SKIP\_EXPORT bit is set to one, the hardware +If the STATUS register's SKIP\_EXPORT bit is set to one, the hardware treats all EXPORT instructions as if they were NOPs. Instructions @@ -6132,7 +6132,7 @@ send data from the SIMM16 field and in some cases from EXEC. | | | 2=emit, | | | | | 3=emit-cut | | +------------+------------+------------+-------------------------------------------+ -| GS-done | 3 | | | +| GS-done | 3 | | | +------------+------------+------------+-------------------------------------------+ | save wave | 4 | - | used in context switching | +------------+------------+------------+-------------------------------------------+ @@ -7292,7 +7292,7 @@ The bitfield map for VOPC is: for which the bitfield is: Compare instructions perform the same compare operation on each lane -(workItem or thread) using that lane’s private data, and producing a 1 +(workItem or thread) using that lane's private data, and producing a 1 bit result per lane into VCC or EXEC. 
Instructions in this format may use a 32-bit literal constant which @@ -10918,14 +10918,14 @@ sections that follow provide details | SOP2 | `section\_title <#_so | 32 | | | p2>`__ | | +-----------------------------------------+-----------------------+------------+ -| SOP1 | `section\_title <#_so | | +| SOP1 | `section\_title <#_so | | | | p1>`__ | | +-----------------------------------------+-----------------------+------------+ | SOPK | `section\_title <#_so | | | | pk>`__ | | +-----------------------------------------+-----------------------+------------+ | SOPP | `section\_title <#_so | | -| | pp>`__ | | +| | pp>`__ | | +-----------------------------------------+-----------------------+------------+ | SOPC | `section\_title <#_so | | | | pc>`__ | | @@ -10961,7 +10961,7 @@ sections that follow provide details | SDWA | `section\_title <#_vo | 32 | | | p2>`__ | | +-----------------------------------------+-----------------------+------------+ -| **Vector Parameter Interpolation | | | +| **Vector Parameter Interpolation | | | | Format** | | | +-----------------------------------------+-----------------------+------------+ | VINTRP | `section\_title <#_vi | 32 | @@ -10979,7 +10979,7 @@ sections that follow provide details | MUBUF | `section\_title <#_mu | 64 | | | buf>`__ | | +-----------------------------------------+-----------------------+------------+ -| **Vector Memory Image Format** | | | +| **Vector Memory Image Format** | | | +-----------------------------------------+-----------------------+------------+ | MIMG | `section\_title <#_mi | 64 | | | mg>`__ | | @@ -12389,8 +12389,8 @@ VOP3 format. | Operation | Offset | | +=================+=========+===================================================+ | Sixteen Compare | | | -| Operations | | | -| (OP16) | | | +| Operations | | | +| (OP16) | | | +-----------------+---------+---------------------------------------------------+ | F | 0 | D.u = 0 | +-----------------+---------+---------------------------------------------------+ @@ -13500,7 +13500,7 @@ SDWA | | | the VGPR that are not selected by DST\_SEL: | | | | | 0 = pad with zeros + 1 = sign extend upper / | | | | zero lower | -| | | | 2 = preserve (don’t modify) | +| | | | 2 = preserve (don't modify) | | | | | 3 = reserved | +-----------------+---------+---------------------------------------------------+ | CLMP | [45] | 1 = clamp result | @@ -14192,7 +14192,7 @@ MTBUF | | | read-data. | +-----------------+---------+---------------------------------------------------+ | SRSRC | [52:48] | SGPR to supply V# (resource constant) in 4 or 8 | -| | | consecutive SGPRs. It is missing 2 LSB’s of | +| | | consecutive SGPRs. It is missing 2 LSB's of | | | | SGPR-address since must be aligned to 4. | +-----------------+---------+---------------------------------------------------+ | SLC | [54] | System level coherent: bypass L2 cache. | @@ -14288,7 +14288,7 @@ MUBUF | | | read-data. | +-----------------+---------+---------------------------------------------------+ | SRSRC | [52:48] | SGPR to supply V# (resource constant) in 4 or 8 | -| | | consecutive SGPRs. It is missing 2 LSB’s of | +| | | consecutive SGPRs. It is missing 2 LSB's of | | | | SGPR-address since must be aligned to 4. | +-----------------+---------+---------------------------------------------------+ | TFE | [55] | Partially resident texture, texture fail enable. | @@ -14471,7 +14471,7 @@ MIMG | | | VGPRn+1. 
| | | | | For D16 writes, DMASK is only used as a word | | | | count: each bit represents 16 bits of data to | -| | | be written starting at the LSB’s of VADDR, then | +| | | be written starting at the LSB's of VADDR, then | | | | MSBs, then VADDR+1 etc. Bit position is | | | | ignored. | +-----------------+---------+---------------------------------------------------+ @@ -14518,11 +14518,11 @@ MIMG | | | read-data. | +-----------------+---------+---------------------------------------------------+ | SRSRC | [52:48] | SGPR to supply V# (resource constant) in 4 or 8 | -| | | consecutive SGPRs. It is missing 2 LSB’s of | +| | | consecutive SGPRs. It is missing 2 LSB's of | | | | SGPR-address since must be aligned to 4. | +-----------------+---------+---------------------------------------------------+ | SSAMP | [57:53] | SGPR to supply V# (resource constant) in 4 or 8 | -| | | consecutive SGPRs. It is missing 2 LSB’s of | +| | | consecutive SGPRs. It is missing 2 LSB's of | | | | SGPR-address since must be aligned to 4. | +-----------------+---------+---------------------------------------------------+ | D16 | [63] | Address offset, unsigned byte. | @@ -14769,7 +14769,7 @@ FLAT | ENCODING | [31:26] | Must be: 110111 | +-----------------+---------+---------------------------------------------------+ | ADDR | [39:32] | | VGPR which holds address or offset. For 64-bit | -| | | addresses, ADDR has the LSB’s and ADDR+1 has | +| | | addresses, ADDR has the LSB's and ADDR+1 has | | | | the MSBs. For offset a single VGPR has a 32 bit | | | | unsigned offset. | | | | | For FLAT\_\*: always specifies an address. | diff --git a/Installation_Guide/FAQ-on-Installation.rst b/Installation_Guide/FAQ-on-Installation.rst index c9055017..a56cfbfa 100644 --- a/Installation_Guide/FAQ-on-Installation.rst +++ b/Installation_Guide/FAQ-on-Installation.rst @@ -10,7 +10,7 @@ Determining if the video card is installed correctly The ROCm software stack has specific requirements regarding the type of GPU supported and how it is installed in the system. The card must be installed in a PCIe slot that supports the 3.0 PCIe specification and the atomics extension. Preferably the slot is x16; x8 an x4 slots will work, but data transfer rates between host memory and GPU memory will be reduced. If the card is not installed in a compatible PCIe slot applications that dispatch a compute kernel will hang waiting for a completion signal from the GPU, which is an atomic operation. -After booting the system with the new driver installed the dmesg output will indicate if there were any problems initializing the GPU. The output of the command ‘sudo dmesg | grep kfd’ will indicate if there were any initialization problems. A properly initialized system will have dmesg output similar to this +After booting the system with the new driver installed the dmesg output will indicate if there were any problems initializing the GPU. The output of the command 'sudo dmesg | grep kfd' will indicate if there were any initialization problems. 
A properly initialized system will have dmesg output similar to this :: dmesg | grep kfd [ 0.000000] Linux version 4.11.0-kfd-compute-roc-master-5051 (jenkins@jenkins-raptor-5) (gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.4) ) #1 SMP Thu Jun 29 21:00:37 CDT 2017 @@ -30,27 +30,27 @@ If the GPU is installed in a PCIe slot that is not supported there will be error Meta package Installation issues, rpm and dpkg *********************************************** -The ROCm repository uses several “meta” packages that provide easy installation for several components of ROCm that do not have natural dependencies. The “meta” packages are empty debian or rpm files that have dependencies on several, unrelated, ROCm components. They are useful in installing or uninstalling the entire ROCm stack with one apt-get or dnf command, and also provide automatic configuration of the /dev/kfd file permissions using the udev service. +The ROCm repository uses several "meta" packages that provide easy installation for several components of ROCm that do not have natural dependencies. The "meta" packages are empty debian or rpm files that have dependencies on several, unrelated, ROCm components. They are useful in installing or uninstalling the entire ROCm stack with one apt-get or dnf command, and also provide automatic configuration of the /dev/kfd file permissions using the udev service. -In some cases users can “break” a ROCm installation by removing one of the “meta” packages using the rpm or dpkg command directly. The rpm and dpkg commands do not resolve dependencies like the dnf and apt-get commands do, and should not be used to remove any ‘meta’ packages, or any other ROCm package. For example, a user can remove the rocm package with the command ‘sudo dpkg –r rocm’ on Ubuntu, but that will not remove any of its dependencies. This is also true for the ‘sudo apt-get remove rocm’ command which will only remove the rocm ‘meta’ package and not its dependencies. To remove a ROCm installation completely, use ‘sudo apt-get autoremove rocm’ for Ubuntu and ‘sudo dnf remove rocm’ for Fedora. +In some cases users can "break" a ROCm installation by removing one of the "meta" packages using the rpm or dpkg command directly. The rpm and dpkg commands do not resolve dependencies like the dnf and apt-get commands do, and should not be used to remove any 'meta' packages, or any other ROCm package. For example, a user can remove the rocm package with the command 'sudo dpkg -r rocm' on Ubuntu, but that will not remove any of its dependencies. This is also true for the 'sudo apt-get remove rocm' command which will only remove the rocm 'meta' package and not its dependencies. To remove a ROCm installation completely, use 'sudo apt-get autoremove rocm' for Ubuntu and 'sudo dnf remove rocm' for Fedora. -The current meta packages are: rocm – Depends on the kernel drivers, firmware and the rocm-dev packages. rocm-dev – Depends on the roct, rocr, rocr extension, hcc and hip packages. rocm-libs – Depends on the hcBLAS, hcFFT, hcRNG, rocBLAS and hipBLAS packages. +The current meta packages are: rocm - Depends on the kernel drivers, firmware and the rocm-dev packages. rocm-dev - Depends on the roct, rocr, rocr extension, hcc and hip packages. rocm-libs - Depends on the hcBLAS, hcFFT, hcRNG, rocBLAS and hipBLAS packages. -If an installation has its ‘meta’ packages removed they can be reinstall using the standard apt-get or dnf command. 
Reinstall the ‘meta’ packages will not reinstall already installed dependencies +If an installation has its 'meta' packages removed, they can be reinstalled using the standard apt-get or dnf command. Reinstalling the 'meta' packages will not reinstall already installed dependencies. Linux Kernels are not uninstalled by default ********************************************** -If ROCm is uninstalled using dnf or apt-get the kernel packages are not uninstalled by default. This is a Linux convention, and isn’t unique the ROCm stack. To remove the kernel packages, they will have to be removed explicitly: +If ROCm is uninstalled using dnf or apt-get the kernel packages are not uninstalled by default. This is a Linux convention, and isn't unique to the ROCm stack. To remove the kernel packages, they will have to be removed explicitly: -For debian – ‘sudo apt-get autoremove ’ For RPM – ‘sudo dnf remove ’ +For debian - 'sudo apt-get autoremove ' For RPM - 'sudo dnf remove ' -The rpm or dpkg command can also be used, but isn’t recommended. +The rpm or dpkg command can also be used, but isn't recommended. Updating firmware may not trigger a rebuilding of ramfs ******************************************************** -If a device isn’t detected by the ROCm kernel drivers, it is possible there is an issue loading required device firmware. This can happen if the system has downlevel firmware or if the firmware is updated, but the ramfs hasn’t been initialized with the new firmware images. To see if this is a problem, check the dmesg of the system: +If a device isn't detected by the ROCm kernel drivers, it is possible there is an issue loading required device firmware. This can happen if the system has downlevel firmware or if the firmware is updated, but the ramfs hasn't been initialized with the new firmware images. To see if this is a problem, check the dmesg of the system: :: dmesg | grep amdgpu [ 4.434129] [drm] amdgpu kernel modesetting enabled. @@ -60,10 +60,10 @@ If a device isn’t detected by the ROCm kernel drivers, it is possible there is [ 4.517733] amdgpu 0000:05:00.0: Fatal error during GPU init [ 4.517757] [drm] amdgpu: finishing device. [ 4.517914] amdgpu: probe of 0000:05:00.0 failed with error -2 - + The error displayed above indicates the kernel is having trouble loading the firmware. -If the firmware version isn’t correct, please install updated firmware packages, which should be available on the repository server. If the correct firmware is installed, reinitialize the ramfs as follows: +If the firmware version isn't correct, please install updated firmware packages, which should be available on the repository server. If the correct firmware is installed, reinitialize the ramfs as follows: **Ubuntu** :: @@ -84,27 +84,27 @@ This problem can occur on Fedora installation if several previous kernels are cu ------------- Disk Requirements: At least 17MB more space needed on the /boot filesystem. - + This is not an issue with the YUM repository; it is caused by the size of the /boot filesystem and the size of the kernels already installed on it. This issue can be fixed by uninstalling previous versions of the rocm Linux kernel: :: sudo dnf remove rocm - rpm -qa | grep kfd | xargs sudo rpm –e + rpm -qa | grep kfd | xargs sudo rpm -e sudo dnf install rocm - + Installing from an archived repository ************************************** The Radeon repo server stores several archived releases, supporting both debian and rpm repositories.
These archives are located here at http://repo.radeon.com/rocm/archive. Users can install with an archive by downloading the desired archive and then updating the package configuration file to point at the localized repo. Debian Archive Example -*********************** +*********************** Here is an Example: :: cd /temp && wget http://repo.radeon.com/rocm/archive/apt_1.6.3.tar.bz2 tar -xvf apt_1.6.3.tar.bz2 - sudo echo “deb [amd64] file://temp/apt_1.6.3 xenial main” > /etc/apt/sources.lists.d/rocm.local.list + sudo echo "deb [amd64] file://temp/apt_1.6.3 xenial main" > /etc/apt/sources.lists.d/rocm.local.list sudo apt-get update && sudo apt-get install rocm Users should make sure that no other list files contain another rocm repo configuration. @@ -119,7 +119,7 @@ Add a /etc/yum.d/rocm.local.repo file with the following contents: :: enabled=1 gpgcheck=0 cd /temp && wget http://repo.radeon.com/rocm/archive/yum_1.6.3.tar.bz2 - tar –xvf yum_1.6.3.tar.bz2 + tar -xvf yum_1.6.3.tar.bz2 Then execute: :: diff --git a/Installation_Guide/HCC-Compiler.rst b/Installation_Guide/HCC-Compiler.rst index 8a350a6c..70336a10 100644 --- a/Installation_Guide/HCC-Compiler.rst +++ b/Installation_Guide/HCC-Compiler.rst @@ -173,4 +173,4 @@ For applications compiled using hcc, ThinLTO could significantly improve link-ti ThinLTO Phase 2 - Under development ************************************** -This ThinLTO implementation which will use llvm-lto LLVM tool to replace clamp-device bash script. It adds an optllc option into ThinLTOGenerator, which will perform in-program opt and codegen in parallel. \ No newline at end of file +This ThinLTO implementation which will use llvm-lto LLVM tool to replace clamp-device bash script. It adds an optllc option into ThinLTOGenerator, which will perform in-program opt and codegen in parallel. diff --git a/Installation_Guide/HIP.rst b/Installation_Guide/HIP.rst index 8b7affc9..1d5fc8ca 100644 --- a/Installation_Guide/HIP.rst +++ b/Installation_Guide/HIP.rst @@ -60,16 +60,16 @@ Programmers familiar with CUDA will also be able to quickly learn and start codi :: hipMalloc(&A_d, Nbytes)); hipMalloc(&C_d, Nbytes)); - + hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice); - + const unsigned blocks = 512; const unsigned threadsPerBlock = 256; hipLaunchKernelGGL(vector_square, /* compute kernel*/ dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/, /* launch config*/ - C_d, A_d, N); /* arguments to the compute kernel */ - - hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); + C_d, A_d, N); /* arguments to the compute kernel */ + + hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors, atomics, and timer functions. It also specifies additional defines and keywords for function types, address spaces, and optimization controls. (See the HIP Kernel Language for a full description). Here's an example of defining a simple 'vector_square' kernel. @@ -80,7 +80,7 @@ The HIP kernel language defines builtins for determining grid and block coordina { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - + for (size_t i=offset; i`_ that uses hipify to convert a simple app from CUDA to HIP: :: - + cd samples/01_Intro/square # follow README / blog steps to hipify the application. 
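For reference, the HIP fragments shown in the hunks above can be assembled into one small program. The following is a minimal, self-contained sketch built from those fragments, not an addition to the patched file; it assumes the HIP runtime is installed and the source is compiled with hipcc (for example, hipcc square.cpp -o square), and it omits error checking for brevity.

::

    #include <hip/hip_runtime.h>
    #include <cstdio>
    #include <vector>

    // Grid-stride kernel corresponding to the vector_square fragment above.
    __global__ void vector_square(float* C_d, const float* A_d, size_t N)
    {
        size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
        size_t stride = hipBlockDim_x * hipGridDim_x;
        for (size_t i = offset; i < N; i += stride) {
            C_d[i] = A_d[i] * A_d[i];
        }
    }

    int main()
    {
        const size_t N = 1000000;
        const size_t Nbytes = N * sizeof(float);
        std::vector<float> A_h(N, 2.0f), C_h(N, 0.0f);

        float *A_d = nullptr, *C_d = nullptr;
        hipMalloc(reinterpret_cast<void**>(&A_d), Nbytes);
        hipMalloc(reinterpret_cast<void**>(&C_d), Nbytes);

        hipMemcpy(A_d, A_h.data(), Nbytes, hipMemcpyHostToDevice);

        const unsigned blocks = 512;
        const unsigned threadsPerBlock = 256;
        hipLaunchKernelGGL(vector_square,               /* compute kernel   */
                           dim3(blocks), dim3(threadsPerBlock),
                           0 /* dynamic shared */, 0 /* stream */,
                           C_d, A_d, N);                /* kernel arguments */

        hipMemcpy(C_h.data(), C_d, Nbytes, hipMemcpyDeviceToHost);
        printf("C_h[0] = %f (expected 4.0)\n", C_h[0]);

        hipFree(A_d);
        hipFree(C_d);
        return 0;
    }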
diff --git a/Installation_Guide/Installation-Guide.rst b/Installation_Guide/Installation-Guide.rst index 94dd0bc8..ee782bf1 100644 --- a/Installation_Guide/Installation-Guide.rst +++ b/Installation_Guide/Installation-Guide.rst @@ -7,12 +7,12 @@ AMD ROCm QuickStart Installation Guide v3.3.0 - `Deploying ROCm`_ - `Ubuntu`_ - + - `Centos RHEL v7.7`_ - + - `SLES 15 Service Pack 1`_ - - + + - `ROCm Installation Known Issues and Workarounds`_ @@ -50,7 +50,7 @@ To install from a Debian Repository: sudo apt install libnuma-dev - sudo reboot + sudo reboot 2. Add the ROCm apt repository. @@ -84,7 +84,7 @@ The current rocm.gpg.key is not available in a standard key ring distribution, b :: groups - + 5. To add your user to the video group, use the following command for the sudo password: @@ -115,7 +115,7 @@ Note: To run the ROCm programs more efficiently, add the ROCm binaries in your P :: - echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | + echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | sudo tee -a /etc/profile.d/rocm.sh @@ -151,9 +151,9 @@ You can install the ROCm user-level software without installing the AMD's custom :: - sudo apt update - sudo apt install rocm-dev - echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' + sudo apt update + sudo apt install rocm-dev + echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' sudo tee /etc/udev/rules.d/70-kfd.rules @@ -177,8 +177,8 @@ Note: The following steps do not apply to the CentOS installation. 2. Enable the following repositories: :: - - sudo subscription-manager repos --enable rhel-server-rhscl-7-rpms + + sudo subscription-manager repos --enable rhel-server-rhscl-7-rpms sudo subscription-manager repos --enable rhel-7-server-optional-rpms sudo subscription-manager repos --enable rhel-7-server-extras-rpms @@ -221,13 +221,13 @@ To install ROCm on your system, follow the instructions below: :: - [ROCm] + [ROCm] name=ROCm - baseurl=http://repo.radeon.com/rocm/yum/rpm + baseurl=http://repo.radeon.com/rocm/yum/rpm enabled=1 gpgcheck=0 -Note: The URL of the repository must point to the location of the repositories’ repodata database. +Note: The URL of the repository must point to the location of the repositories' repodata database. 3. Install ROCm components using the following command: @@ -325,7 +325,7 @@ You can install ROCm user-level software without installing AMD's custom ROCk ke :: sudo yum install rocm-dev - echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' + echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' sudo tee /etc/udev/rules.d/70-kfd.rules **Note**: You can use this command instead of installing rocm-dkms. @@ -336,7 +336,7 @@ You can install ROCm user-level software without installing AMD's custom ROCk ke SLES 15 Service Pack 1 ^^^^^^^^^^^^^^^^^^^^^^^ -The following section tells you how to perform an install and uninstall ROCm on SLES 15 SP 1. +The following section tells you how to perform an install and uninstall ROCm on SLES 15 SP 1. **Installation** @@ -347,13 +347,13 @@ The following section tells you how to perform an install and uninstall ROCm on sudo SUSEConnect --product PackageHub/15.1/x86_64 sudo zypper install dkms - + 2. Add the ROCm repo. 
- + :: - sudo zypper clean –all - sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm + sudo zypper clean -all + sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm sudo zypper ref zypper install rocm-dkms sudo zypper install rocm-dkms @@ -372,7 +372,7 @@ The following section tells you how to perform an install and uninstall ROCm on 5. Run /opt/rocm/bin/rocminfo and /opt/rocm/opencl/bin/x86_64/clinfo commands to list the GPUs and verify that the ROCm installation is successful. -6. Set permissions. +6. Set permissions. To access the GPU, you must be a user in the video group. Ensure your user account is a member of the video group prior to using ROCm. To identify the groups you are a member of, use the following command: @@ -381,11 +381,11 @@ To access the GPU, you must be a user in the video group. Ensure your user accou groups 7. To add your user to the video group, use the following command for the sudo password: - + :: sudo usermod -a -G video $LOGNAME - + 8. By default, add any future users to the video group. Run the following command to add users to the video group: :: @@ -428,9 +428,9 @@ Some users may want to install a subset of the full ROCm installation. If you ar :: sudo yum install rock-dkms rocm-opencl-devel - -ROCm Installation Known Issues and Workarounds + +ROCm Installation Known Issues and Workarounds ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Closed source components @@ -438,7 +438,7 @@ Closed source components The ROCm platform relies on some closed source components to provide functionalities like HSA image support. These components are only available through the ROCm repositories, and they may be deprecated or become open source components in the future. These components are made available in the following packages: -• hsa-ext-rocr-dev +o hsa-ext-rocr-dev Getting the ROCm Source Code @@ -449,7 +449,7 @@ AMD ROCm is built from open source software. It is, therefore, possible to modif Installing the Repo ^^^^^^^^^^^^^^^^^^^^^ -The repo tool from Google® allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo: +The repo tool from Google(R) allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo: :: @@ -515,37 +515,37 @@ ROCm Support Software - ROCm cmake: ``rocm-cmake`` - rocminfo: ``rocminfo`` - ROCm Bandwidth Test: ``rocm_bandwidth_test`` - - + + ROCm Development ToolChain =========================== - HCC compiler: ``hcc`` - + - HIP: ``hip_base``, ``hip_doc``, ``hip_hcc``, ``hip_samples`` - + - ROCm Device Libraries: ``rocm-device-libs`` - + - ROCm OpenCL: ``rocm-opencl``, ``rocm-opencl-devel`` (on RHEL/CentOS), ``rocm-opencl-dev`` (on Ubuntu) - + - ROCM Clang-OCL Kernel Compiler: ``rocm-clang-ocl`` - + - Asynchronous Task and Memory Interface (ATMI): ``atmi`` - + - ROCm Debug Agent: ``rocm_debug_agent`` - + - ROCm Code Object Manager: ``comgr`` - + - ROC Profiler: ``rocprofiler-dev`` - + - ROC Tracer: ``roctracer-dev`` - + - Radeon Compute Profiler: ``rocm-profiler`` - + ROCm Libraries ============== - + - rocALUTION: ``rocalution`` - rocBLAS: ``rocblas`` - hipBLAS: ``hipblas`` @@ -564,9 +564,9 @@ ROCm Libraries To make it easier to install ROCm, the AMD binary repositories provide a number of meta-packages that will automatically install multiple other packages. For example, ``rocm-dkms`` is the primary meta-package that is used to install most of the base technology needed for ROCm to operate. 
-It will install the ``rock-dkms`` kernel driver, and another meta-package +It will install the ``rock-dkms`` kernel driver, and another meta-package (``rocm-dev``) which installs most of the user-land ROCm core components, support software, and development tools. - + The *rocm-utils* meta-package will install useful utilities that, while not required for ROCm to operate, may still be beneficial to have. Finally, the *rocm-libs* meta-package will install some (but not all) of the libraries that are part of ROCm. @@ -653,7 +653,7 @@ The latest supported version of the drivers, tools, libraries and source code fo - `ROCm OpenCL Runtime`_ - `ROCm LLVM OCL`_ - `ROCm Device Libraries OCL`_ - + - `ROCM Clang-OCL Kernel Compiler`_ - `Asynchronous Task and Memory Interface`_ - `ROCr Debug Agent`_ @@ -716,7 +716,7 @@ ROCm Development ToolChain ============================ -.. _HCC compiler: https://github.com/RadeonOpenCompute/hcc/tree/rocm-3.3.0 +.. _HCC compiler: https://github.com/RadeonOpenCompute/hcc/tree/rocm-3.3.0 .. _HIP: https://github.com/ROCm-Developer-Tools/HIP/tree/rocm-3.3.0 @@ -783,7 +783,7 @@ ROCm Libraries .. _MIVisionX: https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/tree/1.7 -.. _AMDMIGraphX: https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/commit/d1e945dabce0078d44c78de67b00232b856e18bc +.. _AMDMIGraphX: https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/commit/d1e945dabce0078d44c78de67b00232b856e18bc @@ -806,16 +806,16 @@ New features and enhancements in ROCm v3.1 **Change in ROCm Installation Directory Structure** -A fresh installation of the ROCm toolkit installs the packages in the /opt/rocm- folder. +A fresh installation of the ROCm toolkit installs the packages in the /opt/rocm- folder. Previously, ROCm toolkit packages were installed in the /opt/rocm folder. **Reliability, Accessibility, and Serviceability Support for Vega 7nm** -The Reliability, Accessibility, and Serviceability (RAS) support for Vega7nm is now available. +The Reliability, Accessibility, and Serviceability (RAS) support for Vega7nm is now available. **SLURM Support for AMD GPU** -SLURM (Simple Linux Utility for Resource Management) is an open source, fault-tolerant, and highly scalable cluster management and job scheduling system for large and small Linux clusters. +SLURM (Simple Linux Utility for Resource Management) is an open source, fault-tolerant, and highly scalable cluster management and job scheduling system for large and small Linux clusters. New features and enhancements in ROCm v3.0 @@ -836,11 +836,11 @@ The Fast Fourier Transform (FFT) is an efficient algorithm for computing the Dis Other improvements: -• More 2D test coverage sizes. +o More 2D test coverage sizes. -• Fix buffer allocation error for large 1D transforms. +o Fix buffer allocation error for large 1D transforms. -• C++ compatibility improvements. +o C++ compatibility improvements. MemCopy Enhancement for rocProf In the v3.0 release, the rocProf tool is enhanced with an additional capability to dump asynchronous GPU memcopy information into a .csv file. You can use the '-hsa-trace' option to create the results_mcopy.csv file. Future enhancements will include column labels. @@ -856,7 +856,7 @@ In the AMD ROCm release v2.10, support is extended to the General Matrix Multipl Support for SLES 15 SP1 -In the AMD ROCm v2.10 release, support is added for SUSE Linux® Enterprise Server (SLES) 15 SP1. SLES is a modular operating system for both multimodal and traditional IT. 
+In the AMD ROCm v2.10 release, support is added for SUSE Linux(R) Enterprise Server (SLES) 15 SP1. SLES is a modular operating system for both multimodal and traditional IT. Code Marker Support for rocProfiler and rocTracer Libraries @@ -882,7 +882,7 @@ ROCm 2.9 adds support for Singularity container version 2.5.2. Initial release of rocTX -ROCm 2.9 introduces rocTX, which provides a C API for code markup for performance profiling. This initial release of rocTX supports annotation of code ranges and ASCII markers. +ROCm 2.9 introduces rocTX, which provides a C API for code markup for performance profiling. This initial release of rocTX supports annotation of code ranges and ASCII markers. * Added support for Ubuntu 18.04.3 * Ubuntu 18.04.3 is now supported in ROCm 2.9. @@ -986,9 +986,9 @@ Bloat16 software support in rocBLAS/Tensile Added mixed precision bfloat16/IEEE f32 to gemm_ex. The input and output matrices are bfloat16. All arithmetic is in IEEE f32. -AMD Infinity Fabric™ Link enablement +AMD Infinity Fabric(TM) Link enablement -The ability to connect four Radeon Instinct MI60 or Radeon Instinct MI50 boards in two hives or two Radeon Instinct MI60 or Radeon Instinct MI50 boards in four hives via AMD Infinity Fabric™ Link GPU interconnect technology has been added. +The ability to connect four Radeon Instinct MI60 or Radeon Instinct MI50 boards in two hives or two Radeon Instinct MI60 or Radeon Instinct MI50 boards in four hives via AMD Infinity Fabric(TM) Link GPU interconnect technology has been added. ROCm-smi features and bug fixes @@ -1008,7 +1008,7 @@ Improvements to *name_get functions RCCL2 Enablement -RCCL2 supports collectives intranode communication using PCIe, Infinity Fabric™, and pinned host memory, as well as internode communication using Ethernet (TCP/IP sockets) and Infiniband/RoCE (Infiniband Verbs). Note: For Infiniband/RoCE, RDMA is not currently supported. +RCCL2 supports collectives intranode communication using PCIe, Infinity Fabric(TM), and pinned host memory, as well as internode communication using Ethernet (TCP/IP sockets) and Infiniband/RoCE (Infiniband Verbs). Note: For Infiniband/RoCE, RDMA is not currently supported. rocFFT enhancements @@ -1055,9 +1055,9 @@ Support overlapping kernel execution in same HIP stream HIP API has been enhanced to allow independent kernels to run in parallel on the same stream. -AMD Infinity Fabric™ Link enablement +AMD Infinity Fabric(TM) Link enablement -The ability to connect four Radeon Instinct MI60 or Radeon Instinct MI50 boards in one hive via AMD Infinity Fabric™ Link GPU interconnect technology has been added. +The ability to connect four Radeon Instinct MI60 or Radeon Instinct MI50 boards in one hive via AMD Infinity Fabric(TM) Link GPU interconnect technology has been added. New features and enhancements in ROCm 2.4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1066,9 +1066,9 @@ TensorFlow 2.0 support ROCm 2.4 includes the enhanced compilation toolchain and a set of bug fixes to support TensorFlow 2.0 features natively -AMD Infinity Fabric™ Link enablement +AMD Infinity Fabric(TM) Link enablement -ROCm 2.4 adds support to connect two Radeon Instinct MI60 or Radeon Instinct MI50 boards via AMD Infinity Fabric™ Link GPU interconnect technology. +ROCm 2.4 adds support to connect two Radeon Instinct MI60 or Radeon Instinct MI50 boards via AMD Infinity Fabric(TM) Link GPU interconnect technology. 
New features and enhancements in ROCm 2.3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1134,7 +1134,7 @@ Added support for multi-GPU training New features and enhancements in ROCm 2.1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -RocTracer v1.0 preview release – 'rocprof' HSA runtime tracing and statistics support - +RocTracer v1.0 preview release - 'rocprof' HSA runtime tracing and statistics support - Supports HSA API tracing and HSA asynchronous GPU activity including kernels execution and memory copy Improvements to ROCM-SMI tool - @@ -1181,9 +1181,9 @@ Creates a stream with the specified priority. It creates a stream on which enque OpenCL 2.0 support -ROCm 2.0 introduces full support for kernels written in the OpenCL 2.0 C language on certain devices and systems. Applications can detect this support by calling the “clGetDeviceInfo” query function with “parame_name” argument set to “CL_DEVICE_OPENCL_C_VERSION”. +ROCm 2.0 introduces full support for kernels written in the OpenCL 2.0 C language on certain devices and systems. Applications can detect this support by calling the "clGetDeviceInfo" query function with "parame_name" argument set to "CL_DEVICE_OPENCL_C_VERSION". -In order to make use of OpenCL 2.0 C language features, the application must include the option “-cl-std=CL2.0” in options passed to the runtime API calls responsible for compiling or building device programs. The complete specification for the OpenCL 2.0 C language can be obtained using the following link: https://www.khronos.org/registry/OpenCL/specs/opencl-2.0-openclc.pdf +In order to make use of OpenCL 2.0 C language features, the application must include the option "-cl-std=CL2.0" in options passed to the runtime API calls responsible for compiling or building device programs. The complete specification for the OpenCL 2.0 C language can be obtained using the following link: https://www.khronos.org/registry/OpenCL/specs/opencl-2.0-openclc.pdf Improved Virtual Addressing (48 bit VA) management for Vega 10 and later GPUs @@ -1232,7 +1232,7 @@ Added DPM support to Vega 7nm Dynamic Power Management feature is enabled on Vega 7nm. -Fix for 'ROCm profiling' that used to fail with a “Version mismatch between HSA runtime and libhsa-runtime-tools64.so.1” error +Fix for 'ROCm profiling' that used to fail with a "Version mismatch between HSA runtime and libhsa-runtime-tools64.so.1" error New features and enhancements in ROCm 1.9.0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1293,7 +1293,7 @@ IPC To try ROCm with an upstream kernel, install ROCm as normal, but do not install the rock-dkms package. 
Also add a udev rule to control /dev/kfd permissions: echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' | sudo tee /etc/udev/rules.d/70-kfd.rules - + New features as of ROCm 1.8.3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1353,4 +1353,4 @@ Binary Package support for Fedora 24 is not currently available Dropping binary package support for Ubuntu 14.04, Fedora 23 IPC support - + diff --git a/Installation_Guide/List-of-ROCm-Packages-for-Ubuntu-Fedora.rst b/Installation_Guide/List-of-ROCm-Packages-for-Ubuntu-Fedora.rst index f9b7d756..3ca0f0a6 100644 --- a/Installation_Guide/List-of-ROCm-Packages-for-Ubuntu-Fedora.rst +++ b/Installation_Guide/List-of-ROCm-Packages-for-Ubuntu-Fedora.rst @@ -5,47 +5,47 @@ List of ROCm Packages for Ubuntu and Fedora ============================================ +-----------------------------------+-----------------------+---------------------------------------------------------+ -|Package | Debian | RPM | +|Package | Debian | RPM | +===================================+=======================+=========================================================+ -|ROCm Master Package | rocm | rocm-1.6.77-Linux.rpm | +|ROCm Master Package | rocm | rocm-1.6.77-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm Developer Master Package | rocm-dev | rocm-dev-1.6.77-Linux.rpm | +|ROCm Developer Master Package | rocm-dev | rocm-dev-1.6.77-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm Libraries Master Package | rocm-libs | rocm-libs-1.6.77-Linux.rpm | +|ROCm Libraries Master Package | rocm-libs | rocm-libs-1.6.77-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ATMI | atmi | atmi-0.3.7-45-gde867f2-Linux.rpm | +|ATMI | atmi | atmi-0.3.7-45-gde867f2-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|HCC | hcc | hcc-1.0.17262-Linux.rpm | +|HCC | hcc | hcc-1.0.17262-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|hcBLAS | hcblas | hcblas-master-482646f-Linux.rpm | +|hcBLAS | hcblas | hcblas-master-482646f-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|hcFFT | hcfft. | hcfft-master-1a96022-Linux.rpm | +|hcFFT | hcfft. | hcfft-master-1a96022-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|hcRNG | hcrng. | hcrng-master-c2ada99-Linux.rpm | +|hcRNG | hcrng. 
| hcrng-master-c2ada99-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|HIP Core | hip_base | hip_base-1.2.17263.rpm | +|HIP Core | hip_base | hip_base-1.2.17263.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ |HIP Documents | hip_doc | hip_doc-1.2.17263.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ |HIP Compiler | hip_hcc | hip_hcc-1.2.17263.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|HIP Samples | hip_samples | hip_samples-1.2.17263.rpm. | +|HIP Samples | hip_samples | hip_samples-1.2.17263.rpm. | +-----------------------------------+-----------------------+---------------------------------------------------------+ |HIPBLAS | hipblas | hipblas-0.4.0.3-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|MIOpen OpenCL Lib | miopen-opencl. | MIOpen-OpenCL-1.0.0-Linux.rpm | +|MIOpen OpenCL Lib | miopen-opencl. | MIOpen-OpenCL-1.0.0-Linux.rpm | ++-----------------------------------+-----------------------+---------------------------------------------------------+ +|rocBLAS | rocblas | rocblas-0.4.2.3-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|rocBLAS | rocblas | rocblas-0.4.2.3-Linux.rpm | -+-----------------------------------+-----------------------+---------------------------------------------------------+ |rocFFT | rocfft | rocm-device-libs-0.0.1-Linux.rpm | -+-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm Device Libs | rocm-device-libs | rocm-device-libs-0.0.1-Linux.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm OpenCL for Dev with CL headers| rocm-opencl-dev | rocm-opencl-devel-1.2.0-1424893.x86_64.rpm | +|ROCm Device Libs | rocm-device-libs | rocm-device-libs-0.0.1-Linux.rpm | ++-----------------------------------+-----------------------+---------------------------------------------------------+ +|ROCm OpenCL for Dev with CL headers| rocm-opencl-dev | rocm-opencl-devel-1.2.0-1424893.x86_64.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|ROCm GDB | rocm-gdb | rocm-gdb-1.5.265-gc4fb045.x86_64.rpm | +|ROCm GDB | rocm-gdb | rocm-gdb-1.5.265-gc4fb045.x86_64.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ -|RCP profiler | rocm-profiler | rocm-profiler-5.1.6386-gbaddcc9.x86_64.rpm | +|RCP profiler | rocm-profiler | rocm-profiler-5.1.6386-gbaddcc9.x86_64.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ |ROCm SMI Tool | rocm-smi | rocm-smi-1.0.0_24_g68893bc-1.x86_64.rpm | +-----------------------------------+-----------------------+---------------------------------------------------------+ diff --git a/Installation_Guide/More-about-how-ROCm-uses-PCIe-Atomics.rst b/Installation_Guide/More-about-how-ROCm-uses-PCIe-Atomics.rst index d31929d3..768b5c8d 100644 --- 
a/Installation_Guide/More-about-how-ROCm-uses-PCIe-Atomics.rst +++ b/Installation_Guide/More-about-how-ROCm-uses-PCIe-Atomics.rst @@ -20,15 +20,15 @@ I/O device which support 32-bit, 64-bit and 128-bit operand which target address For ROCm the Platform atomics are used in ROCm in the following ways: - * Update HSA queue’s read_dispatch_id: 64 bit atomic add used by the command processor on the GPU agent to update the packet ID it processed. - * Update HSA queue’s write_dispatch_id: 64 bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions. - * Update HSA Signals – 64bit atomic ops are used for CPU & GPU synchronization. + * Update HSA queue's read_dispatch_id: 64 bit atomic add used by the command processor on the GPU agent to update the packet ID it processed. + * Update HSA queue's write_dispatch_id: 64 bit atomic add used by the CPU and GPU agent to support multi-writer queue insertions. + * Update HSA Signals - 64bit atomic ops are used for CPU & GPU synchronization. The PCIe 3.0 AtomicOp feature allows atomic transactions to be requested by, routed through and completed by PCIe components. Routing and completion does not require software support. Component support for each is detectable via the DEVCAP2 register. Upstream bridges need to have AtomicOp routing enabled or the Atomic Operations will fall even though PCIe endpoint and PCIe I/O Devices has the capability to Atomics Operations. To do AtomicOp routing capability between two or more Root Ports, each associated Root Port must indicate that capability via the AtomicOp Routing Supported bit in the Device Capabilities 2 register. -If your system has a PCIe Express Switch it needs to support AtomicsOp routing. Again AtomicOp requests are permitted only if a component’s DEVCTL2.ATOMICOP_REQUESTER_ENABLE field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 Routing is supported, AtomicOp Routing Support=0 routing is not supported. +If your system has a PCIe Express Switch it needs to support AtomicsOp routing. Again AtomicOp requests are permitted only if a component's DEVCTL2.ATOMICOP_REQUESTER_ENABLE field is set. These requests can only be serviced if the upstream components support AtomicOp completion and/or routing to a component which does. AtomicOp Routing Support=1 Routing is supported, AtomicOp Routing Support=0 routing is not supported. Atomic Operation is a Non-Posted transaction supporting 32-bit and 64-bit address formats, there must be a response for Completion containing the result of the operation. Errors associated with the operation (uncorrectable error accessing the target location or carrying out the Atomic operation) are signaled to the requester by setting the Completion Status field in the completion descriptor, they are set to to Completer Abort (CA) or Unsupported Request (UR). @@ -54,15 +54,15 @@ Future bus technology with richer I/O Atomics Operation Support * `GenZ `_ -New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPU’s with PCIe Generation 3.0 support. +New PCIe Endpoints with support beyond AMD Ryzen and EPYC CPU; Intel Haswell or newer CPU's with PCIe Generation 3.0 support. 
* `Mellanox Bluefield SOC `_ * `Cavium Thunder X2 `_ -In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets: +In ROCm, we also take advantage of PCIe ID based ordering technology for P2P when the GPU originates two writes to two different targets: | 1. write to another GPU memory, - + | 2. then write to system memory to indicate transfer complete. They are routed off to different ends of the computer but we want to make sure the write to system memory to indicate transfer complete occurs AFTER P2P write to GPU has complete. @@ -76,7 +76,7 @@ On a Xeon E5 based system in the BIOS we can turn on above 4GB PCIe addressing, In SuperMicro system in the system bios you need to see the following * Advanced->PCIe/PCI/PnP configuration-> Above 4G Decoding = Enabled - + * Advanced->PCIe/PCI/PnP Configuration->MMIOH Base = 512G * Advanced->PCIe/PCI/PnP Configuration->MMIO High Size = 256G @@ -90,57 +90,57 @@ For GFX9 and Vega10 which have Physical Address up 44 bit and 48 bit Virtual add * BAR4 register: Optional, not a boot device. * BAR5 register: 32bit, non-prefetchable, MMIO. Must be placed < 4GB. -Here is how our BAR works on GFX 8 GPU’s with 40 bit Physical Address Limit :: +Here is how our BAR works on GFX 8 GPU's with 40 bit Physical Address Limit :: 11:00.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Fiji [Radeon R9 FURY / NANO Series] (rev c1) Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0b35 - + Flags: bus master, fast devsel, latency 0, IRQ 119 - + Memory at bf40000000 (64-bit, prefetchable) [size=256M] - + Memory at bf50000000 (64-bit, prefetchable) [size=2M] - + I/O ports at 3000 [size=256] - + Memory at c7400000 (32-bit, non-prefetchable) [size=256K] - + Expansion ROM at c7440000 [disabled] [size=128K] Legend: -1 : GPU Frame Buffer BAR – In this example it happens to be 256M, but typically this will be size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs. +1 : GPU Frame Buffer BAR - In this example it happens to be 256M, but typically this will be size of the GPU memory (typically 4GB+). This BAR has to be placed < 2^40 to allow peer-to-peer access from other GFX8 AMD GPUs. For GFX9 (Vega GPU) the BAR has to be placed < 2^44 to allow peer-to-peer access from other GFX9 AMD GPUs. -2 : Doorbell BAR – The size of the BAR is typically will be < 10MB (currently fixed at 2MB) for this generation GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs. +2 : Doorbell BAR - The size of the BAR is typically will be < 10MB (currently fixed at 2MB) for this generation GPUs. This BAR has to be placed < 2^40 to allow peer-to-peer access from other current generation AMD GPUs. 3 : IO BAR - This is for legacy VGA and boot device support, but since this the GPUs in this project are not VGA devices (headless), this is not a concern even if the SBIOS does not setup. -4 : MMIO BAR – This is required for the AMD Driver SW to access the configuration registers. Since the reminder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB. +4 : MMIO BAR - This is required for the AMD Driver SW to access the configuration registers. Since the reminder of the BAR available is only 1 DWORD (32bit), this is placed < 4GB. This is fixed at 256KB. 
-5 : Expansion ROM – This is required for the AMD Driver SW to access the GPU’s video-bios. This is currently fixed at 128KB. +5 : Expansion ROM - This is required for the AMD Driver SW to access the GPU's video-bios. This is currently fixed at 128KB. Excepts form Overview of Changes to PCI Express 3.0 =================================================== By Mike Jackson, Senior Staff Architect, MindShare, Inc. ******************************************************** -Atomic Operations – Goal: +Atomic Operations - Goal: ************************* Support SMP-type operations across a PCIe network to allow for things like offloading tasks between CPU cores and accelerators like a GPU. The spec says this enables advanced synchronization mechanisms that are particularly useful with multiple producers or consumers that need to be synchronized in a non-blocking fashion. Three new atomic non-posted requests were added, plus the corresponding completion (the address must be naturally aligned with the operand size or the TLP is malformed): - * Fetch and Add – uses one operand as the “add” value. Reads the target location, adds the operand, and then writes the result back to the original location. + * Fetch and Add - uses one operand as the "add" value. Reads the target location, adds the operand, and then writes the result back to the original location. - * Unconditional Swap – uses one operand as the “swap” value. Reads the target location and then writes the swap value to it. + * Unconditional Swap - uses one operand as the "swap" value. Reads the target location and then writes the swap value to it. - * Compare and Swap – uses 2 operands: first data is compare value, second is swap value. Reads the target location, checks it against the compare value and, if equal, writes the swap value to the target location. + * Compare and Swap - uses 2 operands: first data is compare value, second is swap value. Reads the target location, checks it against the compare value and, if equal, writes the swap value to the target location. - * AtomicOpCompletion – new completion to give the result so far atomic request and indicate that the atomicity of the transaction has been maintained. + * AtomicOpCompletion - new completion to give the result so far atomic request and indicate that the atomicity of the transaction has been maintained. -Since AtomicOps are not locked they don't have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide “lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic.” The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation. +Since AtomicOps are not locked they don't have the performance downsides of the PCI locked protocol. Compared to locked cycles, they provide "lower latency, higher scalability, advanced synchronization algorithms, and dramatically lower impact on other PCIe traffic." The lock mechanism can still be used across a bridge to PCI or PCI-X to achieve the desired operation. AtomicOps can go from device to device, device to host, or host to device. Each completer indicates whether it supports this capability and guarantees atomic access if it does. The ability to route AtomicOps is also indicated in the registers for a given port. -ID-based Ordering – Goal: +ID-based Ordering - Goal: ************************* Improve performance by avoiding stalls caused by ordering rules. 
For example, posted writes are never normally allowed to pass each other in a queue, but if they are requested by different functions, we can have some confidence that the requests are not dependent on each other. The previously reserved Attribute bit [2] is now combined with the RO bit to indicate ID ordering with or without relaxed ordering. diff --git a/Installation_Guide/Quick Start Installation Guide.rst b/Installation_Guide/Quick Start Installation Guide.rst index de5109eb..7763e9dc 100644 --- a/Installation_Guide/Quick Start Installation Guide.rst +++ b/Installation_Guide/Quick Start Installation Guide.rst @@ -12,7 +12,7 @@ AMD ROCm QuickStart Installation Guide v3.1.0 - `SLES 15 Service Pack 1`_ - `ROCm Installation Known Issues and Workarounds`_ - + - `Getting the ROCm Source Code`_ | @@ -53,7 +53,7 @@ To install from a Debian Repository: sudo apt install libnuma-dev - sudo reboot + sudo reboot 2. Add the ROCm apt repository. @@ -87,7 +87,7 @@ The current rocm.gpg.key is not available in a standard key ring distribution, b :: groups - + 5. To add your user to the video group, use the following command for the sudo password: @@ -99,7 +99,7 @@ The current rocm.gpg.key is not available in a standard key ring distribution, b :: - echo 'ADD_EXTRA_GROUPS=1' + echo 'ADD_EXTRA_GROUPS=1' sudo tee -a /etc/adduser.conf echo 'EXTRA_GROUPS=video' @@ -122,7 +122,7 @@ Note: To run the ROCm programs more efficiently, add the ROCm binaries in your P :: - echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | + echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | sudo tee -a /etc/profile.d/rocm.sh @@ -158,9 +158,9 @@ You can install the ROCm user-level software without installing the AMD's custom :: - sudo apt update - sudo apt install rocm-dev - echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' + sudo apt update + sudo apt install rocm-dev + echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' sudo tee /etc/udev/rules.d/70-kfd.rules @@ -186,8 +186,8 @@ Note: The following steps do not apply to the CentOS installation. 2. Enable the following repositories: :: - - sudo subscription-manager repos --enable rhel-server-rhscl-7-rpms + + sudo subscription-manager repos --enable rhel-server-rhscl-7-rpms sudo subscription-manager repos --enable rhel-7-server-optional-rpms sudo subscription-manager repos --enable rhel-7-server-extras-rpms @@ -230,13 +230,13 @@ To install ROCm on your system, follow the instructions below: :: - [ROCm] + [ROCm] name=ROCm - baseurl=http://repo.radeon.com/rocm/yum/rpm + baseurl=http://repo.radeon.com/rocm/yum/rpm enabled=1 gpgcheck=0 -Note: The URL of the repository must point to the location of the repositories’ repodata database. +Note: The URL of the repository must point to the location of the repositories' repodata database. 3. Install ROCm components using the following command: @@ -336,7 +336,7 @@ You can install ROCm user-level software without installing AMD's custom ROCk ke :: sudo yum install rocm-dev - echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' + echo 'SUBSYSTEM=="kfd", KERNEL=="kfd", TAG+="uaccess", GROUP="video"' sudo tee /etc/udev/rules.d/70-kfd.rules **Note**: You can use this command instead of installing rocm-dkms. 
@@ -347,7 +347,7 @@ You can install ROCm user-level software without installing AMD's custom ROCk ke SLES 15 Service Pack 1 ^^^^^^^^^^^^^^^^^^^^^^^ -The following section tells you how to perform an install and uninstall ROCm on SLES 15 SP 1. +The following section tells you how to perform an install and uninstall ROCm on SLES 15 SP 1. **Installation** @@ -358,13 +358,13 @@ The following section tells you how to perform an install and uninstall ROCm on sudo SUSEConnect --product PackageHub/15.1/x86_64 sudo zypper install dkms - + 2. Add the ROCm repo. - + :: - sudo zypper clean –all - sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm + sudo zypper clean -all + sudo zypper addrepo --no-gpgcheck http://repo.radeon.com/rocm/zyp/zypper/ rocm sudo zypper ref zypper install rocm-dkms sudo zypper install rocm-dkms @@ -383,7 +383,7 @@ The following section tells you how to perform an install and uninstall ROCm on 5. Run /opt/rocm/bin/rocminfo and /opt/rocm/opencl/bin/x86_64/clinfo commands to list the GPUs and verify that the ROCm installation is successful. -6. Set permissions. +6. Set permissions. To access the GPU, you must be a user in the video group. Ensure your user account is a member of the video group prior to using ROCm. To identify the groups you are a member of, use the following command: @@ -392,11 +392,11 @@ To access the GPU, you must be a user in the video group. Ensure your user accou groups 7. To add your user to the video group, use the following command for the sudo password: - + :: sudo usermod -a -G video $LOGNAME - + 8. By default, add any future users to the video group. Run the following command to add users to the video group: :: @@ -414,7 +414,7 @@ To access the GPU, you must be a user in the video group. Ensure your user accou /opt/rocm/opencl/bin/x86_64/clinfo Note: To run the ROCm programs more efficiently, add the ROCm binaries in your PATH. -echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | +echo 'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64' | :: @@ -439,9 +439,9 @@ Some users may want to install a subset of the full ROCm installation. If you ar :: sudo yum install rock-dkms rocm-opencl-devel - -ROCm Installation Known Issues and Workarounds + +ROCm Installation Known Issues and Workarounds ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Closed source components @@ -449,7 +449,7 @@ Closed source components The ROCm platform relies on some closed source components to provide functionalities like HSA image support. These components are only available through the ROCm repositories, and they may be deprecated or become open source components in the future. These components are made available in the following packages: -• hsa-ext-rocr-dev +o hsa-ext-rocr-dev Getting the ROCm Source Code @@ -460,7 +460,7 @@ AMD ROCm is built from open source software. It is, therefore, possible to modif Installing the Repo ^^^^^^^^^^^^^^^^^^^^^ -The repo tool from Google® allows you to manage multiple git repositories simultaneously. Run the following commands to install the repo: +The repo tool from Google(R) allows you to manage multiple git repositories simultaneously. 
Run the following commands to install the repo: :: diff --git a/Installation_Guide/QuickStartGuideOpenCL.rst b/Installation_Guide/QuickStartGuideOpenCL.rst index 45c77f25..3a8390be 100644 --- a/Installation_Guide/QuickStartGuideOpenCL.rst +++ b/Installation_Guide/QuickStartGuideOpenCL.rst @@ -3,14 +3,14 @@ Quick Start Guide For OpenCL ============================ -* ROCm 1.7 introduces big updates to our OpenCL compiler and runtime implementation -- built on top of the ROCm software stack! +* ROCm 1.7 introduces big updates to our OpenCL compiler and runtime implementation -- built on top of the ROCm software stack! This developer release includes the following: ------------------------------ * OpenCL 2.0 compatible kernel language support with OpenCL 1.2 compatible runtime -* OpenCL compiler also has assembler and disassembler support, inline assembly support is now in place. -* Big improvements in the base compiler as we roll in new optimization for application in new Native LLVM code generator. +* OpenCL compiler also has assembler and disassembler support, and inline assembly support is now in place. +* Big improvements in the base compiler as we roll in new optimizations for applications in the new native LLVM code generator. * We made our base compiler intrinsics source code available * OCML https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/doc/OCML.md * Source code for the Intrinsic https://github.com/RadeonOpenCompute/ROCm-Device-Libs/tree/master/opencl/src @@ -29,7 +29,7 @@ Install the ROCm OpenCL implementation (assuming you already have the 'rocm' pac For a sample OpenCL application, let's use a simple vector-add example from the University of Bristol's very nice "Hands On OpenCL" lectures. -.. code-block:: +.. code-block:: git clone https://github.com/HandsOnOpenCL/Exercises-Solutions.git @@ -43,15 +43,15 @@ For a sample OpenCL application, let's use a simple vector-add example from the ./vadd -Not for all your application that supported the AMDGPU SDK for OpenCL to get the Header, rocm-opencl-dev now included the headerfiles. +Note: for applications that previously used the AMD APP SDK for OpenCL to get the headers, the rocm-opencl-dev package now includes the header files. -If your built all your code with the AMDAPPSDK you do not need to download anything else, you can just export environment variable to /opt/rocm/opencl +If you built all your code with the AMDAPPSDK, you do not need to download anything else; just export the environment variable to point at /opt/rocm/opencl -Do not install the AMDAPPSDK 3.0 on ROCm OpenCL it designed for old driver which need headers installed. rocm-opencl-dev package does this for you. +Do not install the AMDAPPSDK 3.0 on ROCm OpenCL; it is designed for the old driver, which needs headers to be installed separately. The rocm-opencl-dev package provides these headers for you. Example 1 for AMDAPPSDKROOT :: - export AMDAPPSDKROOT=/opt/rocm/opencl + export AMDAPPSDKROOT=/opt/rocm/opencl Example 2 for AMDAPPSDK @@ -61,10 +61,10 @@ Example 2 for AMDAPPSDK Where is clinfo? :: - /opt/rocm/opencl/bin/x86_64/clinfo + /opt/rocm/opencl/bin/x86_64/clinfo -* That's it! Super easy. +* That's it! Super easy.
Related Resources ----------------- diff --git a/Installation_Guide/ROC-smi.rst b/Installation_Guide/ROC-smi.rst index d4a98db1..7949f456 100644 --- a/Installation_Guide/ROC-smi.rst +++ b/Installation_Guide/ROC-smi.rst @@ -26,16 +26,16 @@ For convenience purposes, following is a quick excerpt: [--setsclk LEVEL [LEVEL ...]] [--setmclk LEVEL [LEVEL ...]] [--setfan LEVEL] [--setperflevel LEVEL] [--setoverdrive %] [--setprofile # # # # #] [--resetprofile] [--load FILE | --save FILE] [--autorespond RESPONSE] - + AMD ROCm System Management Interface - + optional arguments: -h, --help show this help message and exit --load FILE Load Clock, Fan, Performance and Profile settings from FILE --save FILE Save Clock, Fan, Performance and Profile settings to FILE - - -d DEVICE, --device DEVICE Execute command on specified device - + + -d DEVICE, --device DEVICE Execute command on specified device + -i, --showid Show GPU ID -t, --showtemp Show current temperature -c, --showclocks Show current clock frequencies @@ -46,8 +46,8 @@ For convenience purposes, following is a quick excerpt: -o, --showoverdrive Show current OverDrive level -l, --showprofile Show Compute Profile attributes -s, --showclkfrq Show supported GPU and Memory Clock - -a, --showallinfo Show all SMI-supported values values - + -a, --showallinfo Show all SMI-supported values values + -r, --resetclocks Reset clocks to default (auto) --setsclk LEVEL [LEVEL ...] Set GPU Clock Frequency Level Mask (manual) --setmclk LEVEL [LEVEL ...] Set GPU Memory Clock Frequency Mask (manual) @@ -55,8 +55,8 @@ For convenience purposes, following is a quick excerpt: --setperflevel LEVEL Set PowerPlay Performance Level --setoverdrive % Set GPU OverDrive level (manual|high) --setprofile # # # # # Specify Compute Profile attributes (auto) - --resetprofile Reset Compute Profile - + --resetprofile Reset Compute Profile + --autorespond RESPONSE Response to automatically provide for all prompts (NOT RECOMMENDED) @@ -69,13 +69,13 @@ Detailed Option Descriptions The clock levels will change dynamically based on GPU load based on the default Compute and Graphics profiles. The thresholds and delays for a custom mask cannot be controlled through the SMI tool - + This flag automatically sets the Performance Level to "manual" as the mask is not applied when the Performance level is set to auto --setfan LEVEL: This sets the fan speed to a value ranging from 0 to 255 (not from 0-100%). 
-::
+::
 NOTE: While the hardware is usually capable of overriding this value when required, it is recommended not to set the fan level lower than the default value for extended periods of time
@@ -87,18 +87,18 @@ Detailed Option Descriptions
 ::
 NOTES: This option can be used in conjunction with the --setsclk mask
-
+
 Operating the GPU outside of specifications can cause irreparable damage to your hardware
 Please observe the warning displayed when using this option
-
+
 This flag automatically sets the sclk to the highest level, as only the highest level is increased by the OverDrive value
-
+
 --setprofile # # # # #: The Compute Profile accepts 5 parameters, which are (in order):
 Minimum SCLK - Minimum GPU clock speed in MHz
 Minimum MCLK - Minimum GPU Memory clock speed in MHz
 Activity threshold - Workload required before clock levels change (%)
 Hysteresis Up - Delay before clock level is increased in milliseconds
 Hysteresis Down - Delay before clock level is decreased in milliseconds
 ::
 NOTES: When a compute queue is detected, these values will be automatically applied to the system
-
+
 Compute Power Profiles are only applied when the Performance Level is set to "auto" so using this flag will automatically set the performance level to "auto"
@@ -115,7 +115,7 @@ Any new functionality added to the SMI should have a corresponding test added to
 GitHub
 ********
-For more information please refer `Github link `_.
+For more information, please refer to the `Github link `_.
 Disclaimer
 *************
diff --git a/Installation_Guide/ROCK-Kernel-Driver_readme.rst b/Installation_Guide/ROCK-Kernel-Driver_readme.rst
index ec80ede1..3b004d7f 100644
--- a/Installation_Guide/ROCK-Kernel-Driver_readme.rst
+++ b/Installation_Guide/ROCK-Kernel-Driver_readme.rst
@@ -61,7 +61,7 @@ LICENSE
 #########
 The following lists the different licenses that apply to the different components in this repository:
-
+
 | The Linux kernel images are covered by the modified GPL license in COPYING
 | The firmware image is covered by the license in LICENSE.ucode
diff --git a/Installation_Guide/ROCR-Runtime.rst b/Installation_Guide/ROCR-Runtime.rst
index 9427b56a..ffe04e6b 100644
--- a/Installation_Guide/ROCR-Runtime.rst
+++ b/Installation_Guide/ROCR-Runtime.rst
@@ -13,7 +13,7 @@ Initial target platform requirements
 * CPU: Intel Haswell or newer, Core i5, Core i7, Xeon E3 v4 & v5; Xeon E5 v3
 * GPU: Fiji ASIC (AMD R9 Nano, R9 Fury and R9 Fury X)
 * GPU: Polaris ASIC (AMD RX480)
-
+
 Source code
 **************
 The HSA core runtime source code for the ROCR runtime is located in the src subdirectory. Please consult the associated README.md file for contents and build instructions.
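The ROCR runtime implements the HSA core API that the rest of the ROCm stack builds on. As a rough illustration of what a host program linked against it looks like, the sketch below initializes the runtime and lists the agents (CPUs and GPUs) it discovers. It is a hedged example written for this guide, not an excerpt from the ROCR sources, and the include path and -lhsa-runtime64 link flag in the comment are assumptions about a typical install. ::

    /* Minimal HSA runtime sketch -- build with something like:
       gcc list_agents.c -I/opt/rocm/include -L/opt/rocm/lib -lhsa-runtime64 */
    #include <stdio.h>
    #include <hsa/hsa.h>

    /* Called once per agent (CPU or GPU) discovered by the runtime. */
    static hsa_status_t print_agent(hsa_agent_t agent, void *data) {
        (void)data;
        char name[64] = {0};
        hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
        printf("Found HSA agent: %s\n", name);
        return HSA_STATUS_SUCCESS;
    }

    int main(void) {
        if (hsa_init() != HSA_STATUS_SUCCESS) {
            fprintf(stderr, "hsa_init failed -- is the ROCk kernel driver loaded?\n");
            return 1;
        }
        hsa_iterate_agents(print_agent, NULL);
        hsa_shut_down();
        return 0;
    }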
diff --git a/Installation_Guide/ROCk-kernel.rst b/Installation_Guide/ROCk-kernel.rst index b7d986c0..0a3bf4a9 100644 --- a/Installation_Guide/ROCk-kernel.rst +++ b/Installation_Guide/ROCk-kernel.rst @@ -6,11 +6,11 @@ ROCk-Kernel The following is a sequence of commands to Install ROCk-Kernel into the system: -**# OPTIONAL :** +**# OPTIONAL :** upgrade your base kernel to 4.13.0-32-generic, **reboot required** :: sudo apt update && sudo apt install linux-headers-4.13.0-32-generic linux-image-4.13.0-32-generic linux-image-extra-4.13.0-32-generic linux-signed-image-4.13.0-32-generic - sudo reboot + sudo reboot Installation steps: ################### @@ -27,7 +27,7 @@ Install the ROCm compute firmware and rock-dkms kernel modules, **reboot require :: sudo adduser $LOGNAME video -Make sure to reboot the machine after installing the ROCm kernel package to force the new kernel to load on reboot. +Make sure to reboot the machine after installing the ROCm kernel package to force the new kernel to load on reboot. You can verify the ROCm kernel is loaded by typing the following command at a prompt: :: @@ -38,5 +38,5 @@ Printed on the screen should be similar as follows: amdkfd 270336 4 amd_iommu_v2 20480 1 amdkfd amdkcl 24576 3 amdttm,amdgpu,amdkfd - - + + diff --git a/Installation_Guide/atmi.rst b/Installation_Guide/atmi.rst index 8fb8d621..bed873bc 100644 --- a/Installation_Guide/atmi.rst +++ b/Installation_Guide/atmi.rst @@ -1,12 +1,12 @@ .. _Asynch: ===== -ATMI +ATMI ===== ATMI (Asynchronous Task and Memory Interface) Asynchronous Task and Memory Interface, or ATMI, is a runtime framework and declarative programming model for heterogeneous CPU-GPU systems. It provides a consistent API to create task graphs on CPUs and GPUs (integrated and discrete). ATMI is a declarative programming model, where high-level tasks can be simply described by using a few predefined C-style structures. The task description includes specifying its granularity, dependencies to other tasks, data requirements and so on. The ATMI runtime, based on the task graph, will perform task scheduling and memory management that is optimal for the underlying platform. ATMI provides a rich and flexible user interface so that the end user can relinquish scheduling to the runtime (default behavior) or take full control of scheduling and mapping, if desired. The target audience for ATMI is application programmers or middleware developers for high-level languages. -Compilation and Runtime Workflow +Compilation and Runtime Workflow ************************************ The below figure depicts the ATMI runtime workflow with CLOC as the compiler utility. @@ -39,12 +39,12 @@ ATMI v0.3 * Devices supported: AMD Carrizo and Kaveri APUs, and AMD Fiji dGPU * Runtimes used: ROCm v1.2 -License +License ********* MIT License -Copyright © 2016 Advanced Micro Devices, Inc. +Copyright (C) 2016 Advanced Micro Devices, Inc. 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
@@ -52,4 +52,4 @@ The above copyright notice and this permission notice shall be included in all c
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-Link to Github Repository `ATMI `_
+Link to Github Repository `ATMI `_
diff --git a/Other_Solutions/Other-Solutions.rst b/Other_Solutions/Other-Solutions.rst
index 89b2611b..d356c021 100644
--- a/Other_Solutions/Other-Solutions.rst
+++ b/Other_Solutions/Other-Solutions.rst
@@ -5,7 +5,7 @@ System Level Debug
 =====================
-ROCm Language & System Level Debug, Flags and Environment Variables
+ROCm Language & System Level Debug, Flags and Environment Variables
 #####################################################################
 | Kernel options to avoid Ethernet port getting renamed every time you change graphics cards
@@ -15,32 +15,32 @@ ROCr Error Code
 ******************
 * 2 Invalid Dimension
-* 4 Invalid Group Memory
-* 8 Invalid (or Null) Code
+* 4 Invalid Group Memory
+* 8 Invalid (or Null) Code
 * 32 Invalid Format
-* 64 Group is too large
-* 128 Out of VGPR’s
-* 0x80000000 Debug Trap
+* 64 Group is too large
+* 128 Out of VGPR's
+* 0x80000000 Debug Trap
-Command to dump firmware version and get Linux Kernel version
+Command to dump firmware version and get Linux Kernel version
 *****************************************************************
-* sudo cat /sys/kernel/debug/dri/1/amdgpu_firmware_info
-* uname -a
+* sudo cat /sys/kernel/debug/dri/1/amdgpu_firmware_info
+* uname -a
-Debug Flags
+Debug Flags
 ***************
 Debug messages are available when developing/debugging the base ROCm driver. You can enable printing from libhsakmt.so by setting the HSAKMT_DEBUG_LEVEL environment variable. Available debug levels are 3~7. The higher the level you set, the more messages will print.
 * export HSAKMT_DEBUG_LEVEL=3 : only pr_err() will print.
 * export HSAKMT_DEBUG_LEVEL=4 : pr_err() and pr_warn() will print.
-* export HSAKMT_DEBUG_LEVEL=5 : We currently don’t implement “notice”. Setting to 5 is same as setting to 4.
+* export HSAKMT_DEBUG_LEVEL=5 : We currently don't implement "notice". Setting it to 5 is the same as setting it to 4.
 * export HSAKMT_DEBUG_LEVEL=6 : pr_err(), pr_warn(), and pr_info will print.
 * export HSAKMT_DEBUG_LEVEL=7 : Everything including pr_debug will print.
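For example, to capture the most verbose libhsakmt output while reproducing a problem, you might run something like the following; the application name is only a placeholder for whatever you are debugging: ::

    export HSAKMT_DEBUG_LEVEL=7
    ./my_hip_app        # placeholder application; libhsakmt debug messages appear on the console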
-ROCr level env variable for debug
+ROCr level env variable for debug
 ************************************
 * HSA_ENABLE_SDMA=0
@@ -51,9 +51,9 @@ ROCr level env variable for debug
 Turn Off Page Retry on GFX9/Vega devices
 **********************
- * sudo –s
+ * sudo -s
 * echo 1 > /sys/module/amdkfd/parameters/noretry
-
+
 HCC Debug Environment Variables
@@ -165,7 +165,7 @@ PCIe-Debug
 Refer here for :ref:`PCIe-Debug`
-**There’s some more information here on how to debug and profile HIP applications**
+**There's some more information here on how to debug and profile HIP applications**
 * `HIP-Debugging `_
 * `HIP-Profiling `_
diff --git a/Other_Solutions/PCIe-Debug.rst b/Other_Solutions/PCIe-Debug.rst
index c1470cc3..2b8c9469 100644
--- a/Other_Solutions/PCIe-Debug.rst
+++ b/Other_Solutions/PCIe-Debug.rst
@@ -3,7 +3,7 @@ ROCm PCIe Debug
 =================
-lspci helpfull options to help you debug ROCm install issue
+lspci helpful options to help you debug ROCm installation issues
 **************************************************************
 **To find out if the Linux kernel is seeing your GPU and to get the slot number and part number of the device you want to look at**
@@ -17,7 +17,7 @@ lspci helpfull options to help you debug ROCm install issue
 63:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] Device 6860
-**Show Device Slot**
+**Show Device Slot**
 lspci -s _slot number_
@@ -36,7 +36,7 @@ Example
 ::
 ~$ sudo lspci -vs 63:00.0
- [sudo] password for rocm:
+ [sudo] password for rocm:
 63:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] Device 6860 (prog-if 00 [VGA controller])
 Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c35
 Flags: bus master, fast devsel, latency 0, IRQ 412
@@ -62,7 +62,7 @@ Example
 Kernel modules: amdgpu
-**Display Vendor and Device Codes and numbers**
+**Display Vendor and Device Codes and numbers**
 lspci -nvmms _slot number_
@@ -74,11 +74,11 @@ lspci -nvmms _slot number_
 Vendor: 1002
 Device: 6860
 SVendor: 1002
- SDevice: 0c35
+ SDevice: 0c35
+
+
+**To show kernel module running on device**
-
-**To show kernel module running on device**
-
 lspci -ks _slot number_
 ::
@@ -89,11 +89,11 @@ lspci -nvmms _slot number_
 Kernel driver in use: amdgpu
 Kernel modules: amdgpu
-**When you need more information on the device**
+**When you need more information on the device**
 sudo lspci -vvvs _slot number_
-Example
+Example
 ::
@@ -158,9 +158,9 @@ Example
 Kernel driver in use: amdgpu
 Kernel modules: amdgpu
-
+
 **To print PCIe root tree**
-
+
 ::
 ~$ lspci -tv
diff --git a/Other_Solutions/ROCm_PCIe_Debug.md b/Other_Solutions/ROCm_PCIe_Debug.md
index 633fe34b..44a02adc 100644
--- a/Other_Solutions/ROCm_PCIe_Debug.md
+++ b/Other_Solutions/ROCm_PCIe_Debug.md
@@ -1,5 +1,5 @@
-lspci helpfull options to help you debug ROCm install issue
+lspci helpful options to help you debug ROCm installation issues
 **************************************************************
 **To find out if the Linux kernel is seeing your GPU and to get the slot number and part number of the device you want to look at**
@@ -12,7 +12,7 @@ lspci helpfull options to help you debug ROCm install issue
 43:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] Device 6860
 63:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
[AMD/ATI] Device 6860 ``` -**Show Device Slot** +**Show Device Slot** lspci -s _slot number_ @@ -31,7 +31,7 @@ Example :: ~$ sudo lspci -vs 63:00.0 - [sudo] password for rocm: + [sudo] password for rocm: 63:00.0 VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] Device 6860 (prog-if 00 [VGA controller]) Subsystem: Advanced Micro Devices, Inc. [AMD/ATI] Device 0c35 Flags: bus master, fast devsel, latency 0, IRQ 412 @@ -57,7 +57,7 @@ Example Kernel modules: amdgpu -**Display Vendor and Device Codes and numbers** +**Display Vendor and Device Codes and numbers** lspci -nvmms _slot number_ @@ -69,11 +69,11 @@ lspci -nvmms _slot number_ Vendor: 1002 Device: 6860 SVendor: 1002 - SDevice: 0c35 + SDevice: 0c35 + + +**To show kernel module running on device** - -**To show kernel module running on device** - lspci -ks _slot number_ :: @@ -84,11 +84,11 @@ lspci -nvmms _slot number_ Kernel driver in use: amdgpu Kernel modules: amdgpu -**When you need more information on the device** +**When you need more information on the device** sudo lspci -vvvs _slot number_ -Example +Example :: @@ -153,9 +153,9 @@ Example Kernel driver in use: amdgpu Kernel modules: amdgpu - + **To print PCIe root tree** - + :: ~$ lspci -tv diff --git a/Programming_Guides/CUDAAPIHIP.rst b/Programming_Guides/CUDAAPIHIP.rst index 88309746..926440ed 100644 --- a/Programming_Guides/CUDAAPIHIP.rst +++ b/Programming_Guides/CUDAAPIHIP.rst @@ -849,7 +849,7 @@ CUDA Driver API functions supported by HIP +----------------------+-----+ | cuGetErrorString | | +----------------------+-----+ - + 3. Initialization ------------------- @@ -1480,7 +1480,7 @@ CUDA Driver API functions supported by HIP +------------------------------------+--------------------------------------------------------+ | cuGraphicsD3D9RegisterResource | | +------------------------------------+--------------------------------------------------------+ - + 27.1. Direct3D 9 Interoperability [DEPRECATED] ------------------------------------------------ diff --git a/Programming_Guides/CUDAAPIHIPTEXTURE.rst b/Programming_Guides/CUDAAPIHIPTEXTURE.rst index 7b13131a..6bbcb136 100644 --- a/Programming_Guides/CUDAAPIHIPTEXTURE.rst +++ b/Programming_Guides/CUDAAPIHIPTEXTURE.rst @@ -623,7 +623,7 @@ CUDA Runtime API functions supported by HIP 28. C++ API Routines ----------------------- -(7.0 contains, 7.5 doesn’t) +(7.0 contains, 7.5 doesn't) +-------------------------------------------------------------+--------------------------------------------------+ | CUDA | HIP | diff --git a/Programming_Guides/HIP-FAQ.rst b/Programming_Guides/HIP-FAQ.rst index 0d58bc13..97c89028 100644 --- a/Programming_Guides/HIP-FAQ.rst +++ b/Programming_Guides/HIP-FAQ.rst @@ -45,7 +45,7 @@ See the `API Support Table `_, which utilizes `rocBlas `_. @@ -93,7 +93,7 @@ Additionally, some of the cublas routines are automatically converted to hipblas Both AMD and Nvidia support OpenCL 1.2 on their devices, so developers can write portable code. HIP offers several benefits over OpenCL: - * Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on. + * Developers can code in C++ as well as mix host and device C++ code in their source files. HIP C++ code can use templates, lambdas, classes and so on. * The HIP API is less verbose than OpenCL and is familiar to CUDA developers. * Because both CUDA and HIP are C++ languages, porting from CUDA to HIP is significantly easier than porting from CUDA to OpenCL. 
* HIP uses the best available development tools on each platform: on Nvidia GPUs, HIP code compiles using NVCC and can employ the nSight profiler and debugger (unlike OpenCL on Nvidia GPUs).
diff --git a/Programming_Guides/HIP-GUIDE.rst b/Programming_Guides/HIP-GUIDE.rst
index f032101a..88accc95 100644
--- a/Programming_Guides/HIP-GUIDE.rst
+++ b/Programming_Guides/HIP-GUIDE.rst
@@ -10,7 +10,7 @@ HIP provides a C++ syntax that is suitable for compiling most code that commonly
 * Math functions resembling those in the "math.h" header included with standard C++ compilers
 * Built-in functions for accessing specific GPU hardware capabilities
-This section describes the built-in variables and functions accessible from the HIP kernel. It’s intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different.
+This section describes the built-in variables and functions accessible from the HIP kernel. It's intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different.
 Features are marked with one of the following keywords:
@@ -21,9 +21,9 @@ Features are marked with one of the following keywords:
-Function-Type Qualifiers
-*************************
-
+Function-Type Qualifiers
+*************************
+
 **__device__**
 Supported __device__ functions are
@@ -70,34 +70,34 @@ __global__ functions are often referred to as kernels, and calling one is termed
 * hipStream_t: stream where the kernel should execute. A value of 0 corresponds to the NULL stream (see :ref:`Synchronization-Functions`).
 * Kernel arguments follow these first five parameters ::
-
+
   //Example pseudo code introducing hipLaunchKernelGGL
   __global__ void MyKernel(float *A, float *B, float *C, size_t N)
   {
   ...
-  }
+  }
   //Replace MyKernel<<<dim3(gridDim), dim3(groupDim), 0, 0>>> (a,b,c,n);
   hipLaunchKernelGGL(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream*/, a, b, c, n);
-The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The :ref:`dim3` constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See dim3. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing.
+The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The :ref:`dim3` constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See dim3. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that's currently executing.
 ..
_Kernel: Kernel-Launch Example +++++++++++++++++++++++ - + :: - - // Example showing device function, __device__ __host__ - // <- compile for both device and host - float PlusOne(float x) + + // Example showing device function, __device__ __host__ + // <- compile for both device and host + float PlusOne(float x) { return x + 1.0; } - __global__ - void + __global__ + void MyKernel (const float *a, const float *b, float *c, unsigned N) { unsigned gid = hipThreadIdx_x; // <- coordinate index function @@ -110,18 +110,18 @@ Kernel-Launch Example float *a, *b, *c; // initialization not shown... unsigned N = 1000000; const unsigned blockSize = 256; - hipLaunchKernelGGL(MyKernel, + hipLaunchKernelGGL(MyKernel, (N/blockSize), dim3(blockSize), 0, 0, a,b,c,N); } - + Variable-Type Qualifiers ************************ **__constant__** - + The __constant__ keyword is supported. The host writes constant memory before launching the kernel; from the GPU, this memory is read-only during kernel execution. The functions for accessing constant memory (hipGetSymbolAddress(), hipGetSymbolSize(), hipMemcpyToSymbol(), hipMemcpyToSymbolAsync, hipMemcpyFromSymbol, hipMemcpyFromSymbolAsync) are under development. **__shared__** @@ -149,19 +149,19 @@ These built-ins determine the coordinate of the active work item in the executio hipThreadIdx_x threadIdx.x hipThreadIdx_y threadIdx.y hipThreadIdx_z threadIdx.z - + hipBlockIdx_x blockIdx.x hipBlockIdx_y blockIdx.y hipBlockIdx_z blockIdx.z - + hipBlockDim_x blockDim.x hipBlockDim_y blockDim.y hipBlockDim_z blockDim.z - + hipGridDim_x gridDim.x hipGridDim_y gridDim.y @@ -206,9 +206,9 @@ dim3 dim3 is a three-dimensional integer vector type commonly used to specify grid and group dimensions. Unspecified dimensions are initialized to 1. :: typedef struct dim3 { - uint32_t x; - uint32_t y; - uint32_t z; + uint32_t x; + uint32_t y; + uint32_t z; dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {}; }; @@ -243,357 +243,357 @@ Following is the list of supported single precision mathematical functions. +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ | Function | Supported on Host | Supported on Device | +====================================================================================================+===================+=====================+ -| float acosf ( float x ) | ✓ | ✓ | +| float acosf ( float x ) | ? | ? | | | | | | Calculate the arc cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float acoshf ( float x ) | ✓ | ✓ | +| float acoshf ( float x ) | ? | ? | | | | | | Calculate the nonnegative arc hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float asinf ( float x ) | ✓ | ✓ | +| float asinf ( float x ) | ? | ? | | | | | | Calculate the arc sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float asinhf ( float x ) | ✓ | ✓ | +| float asinhf ( float x ) | ? | ? | | | | | | Calculate the arc hyperbolic sine of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atan2f ( float y, float x ) | ✓ | ✓ | +| float atan2f ( float y, float x ) | ? | ? | | | | | | Calculate the arc tangent of the ratio of first and second input arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atanf ( float x ) | ✓ | ✓ | +| float atanf ( float x ) | ? | ? | | | | | | Calculate the arc tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atanhf ( float x ) | ✓ | ✓ | +| float atanhf ( float x ) | ? | ? | | | | | | Calculate the arc hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float cbrtf ( float x ) | ✓ | ✓ | +| float cbrtf ( float x ) | ? | ? | | | | | | Calculate the cube root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ceilf ( float x ) | ✓ | ✓ | +| float ceilf ( float x ) | ? | ? | | | | | | Calculate ceiling of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float copysignf ( float x, float y ) | ✓ | ✓ | +| float copysignf ( float x, float y ) | ? | ? | | | | | | Create value with given magnitude, copying sign of second value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float cosf ( float x ) | ✓ | ✓ | +| float cosf ( float x ) | ? | ? | | | | | | Calculate the cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float coshf ( float x ) | ✓ | ✓ | +| float coshf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcf ( float x ) | ✓ | ✓ | +| float erfcf ( float x ) | ? | ? | | | | | | Calculate the complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erff ( float x ) | ✓ | ✓ | +| float erff ( float x ) | ? | ? | | | | | | Calculate the error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float exp10f ( float x ) | ✓ | ✓ | +| float exp10f ( float x ) | ? | ? | | | | | | Calculate the base 10 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float exp2f ( float x ) | ✓ | ✓ | +| float exp2f ( float x ) | ? | ? | | | | | | Calculate the base 2 exponential of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float expf ( float x ) | ✓ | ✓ | +| float expf ( float x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float expm1f ( float x ) | ✓ | ✓ | +| float expm1f ( float x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument, minus 1. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fabsf ( float x ) | ✓ | ✓ | +| float fabsf ( float x ) | ? | ? | | | | | | Calculate the absolute value of its argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fdimf ( float x, float y ) | ✓ | ✓ | +| float fdimf ( float x, float y ) | ? | ? | | | | | | Compute the positive difference between x and y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float floorf ( float x ) | ✓ | ✓ | +| float floorf ( float x ) | ? | ? | | | | | | Calculate the largest integer less than or equal to x. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmaf ( float x, float y, float z ) | ✓ | ✓ | +| float fmaf ( float x, float y, float z ) | ? | ? | | | | | -| Compute x × y + z as a single operation. | | | +| Compute x x y + z as a single operation. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmaxf ( float x, float y ) | ✓ | ✓ | +| float fmaxf ( float x, float y ) | ? | ? | | | | | | Determine the maximum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fminf ( float x, float y ) | ✓ | ✓ | +| float fminf ( float x, float y ) | ? | ? | | | | | | Determine the minimum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmodf ( float x, float y ) | ✓ | ✓ | +| float fmodf ( float x, float y ) | ? | ? | | | | | | Calculate the floating-point remainder of x / y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float frexpf ( float x, int* nptr ) | ✓ | ✗ | +| float frexpf ( float x, int* nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float hypotf ( float x, float y ) | ✓ | ✓ | +| float hypotf ( float x, float y ) | ? | ? | | | | | | Calculate the square root of the sum of squares of two arguments. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| int ilogbf ( float x ) | ✓ | ✓ | +| int ilogbf ( float x ) | ? | ? | | | | | | Compute the unbiased integer exponent of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isfinite ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isfinite ( float a ) | ? | ? | | | | | | Determine whether argument is finite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isinf ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isinf ( float a ) | ? | ? | | | | | | Determine whether argument is infinite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isnan ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isnan ( float a ) | ? | ? | | | | | | Determine whether argument is a NaN. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ldexpf ( float x, int exp ) | ✓ | ✓ | +| float ldexpf ( float x, int exp ) | ? | ? | | | | | -| Calculate the value of x ⋅ 2exp. | | | +| Calculate the value of x ? 2exp. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log10f ( float x ) | ✓ | ✓ | +| float log10f ( float x ) | ? | ? | | | | | | Calculate the base 10 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log1pf ( float x ) | ✓ | ✓ | +| float log1pf ( float x ) | ? | ? | | | | | | Calculate the value of loge( 1 + x ). | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float logbf ( float x ) | ✓ | ✓ | +| float logbf ( float x ) | ? | ? | | | | | | Calculate the floating point representation of the exponent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log2f ( float x ) | ✓ | ✓ | +| float log2f ( float x ) | ? | ? | | | | | | Calculate the base 2 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float logf ( float x ) | ✓ | ✓ | +| float logf ( float x ) | ? | ? | | | | | | Calculate the natural logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float modff ( float x, float* iptr ) | ✓ | ✗ | +| float modff ( float x, float* iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nanf ( const char* tagp ) | ✗ | ✓ | +| float nanf ( const char* tagp ) | ? | ? 
| | | | | | Returns "Not a Number"" value." | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nearbyintf ( float x ) | ✓ | ✓ | +| float nearbyintf ( float x ) | ? | ? | | | | | | Round the input argument to the nearest integer. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float powf ( float x, float y ) | ✓ | ✓ | +| float powf ( float x, float y ) | ? | ? | | | | | | Calculate the value of first argument to the power of second argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remainderf ( float x, float y ) | ✓ | ✓ | +| float remainderf ( float x, float y ) | ? | ? | | | | | | Compute single-precision floating-point remainder. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remquof ( float x, float y, int* quo ) | ✓ | ✗ | +| float remquof ( float x, float y, int* quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float roundf ( float x ) | ✓ | ✓ | +| float roundf ( float x ) | ? | ? | | | | | | Round to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float scalbnf ( float x, int n ) | ✓ | ✓ | +| float scalbnf ( float x, int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 signbit ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 signbit ( float a ) | ? | ? | | | | | | Return the sign bit of the input. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincosf ( float x, float* sptr, float* cptr ) | ✓ | ✗ | +| void sincosf ( float x, float* sptr, float* cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sinf ( float x ) | ✓ | ✓ | +| float sinf ( float x ) | ? | ? | | | | | | Calculate the sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sinhf ( float x ) | ✓ | ✓ | +| float sinhf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sqrtf ( float x ) | ✓ | ✓ | +| float sqrtf ( float x ) | ? | ? | | | | | | Calculate the square root of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tanf ( float x ) | ✓ | ✓ | +| float tanf ( float x ) | ? | ? | | | | | | Calculate the tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tanhf ( float x ) | ✓ | ✓ | +| float tanhf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float truncf ( float x ) | ✓ | ✓ | +| float truncf ( float x ) | ? | ? | | | | | | Truncate input argument to the integral part. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tgammaf ( float x ) | ✓ | ✓ | +| float tgammaf ( float x ) | ? | ? | | | | | | Calculate the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcinvf ( float y ) | ✓ | ✓ | +| float erfcinvf ( float y ) | ? | ? | | | | | | Calculate the inverse complementary function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcxf ( float x ) | ✓ | ✓ | +| float erfcxf ( float x ) | ? | ? | | | | | | Calculate the scaled complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfinvf ( float y ) | ✓ | ✓ | +| float erfinvf ( float y ) | ? | ? | | | | | | Calculate the inverse error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fdividef ( float x, float y ) | ✓ | ✓ | +| float fdividef ( float x, float y ) | ? | ? | | | | | | Divide two floating point values. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float frexpf ( float x, int *nptr ) | ✓ | ✓ | +| float frexpf ( float x, int *nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float j0f ( float x ) | ✓ | ✓ | +| float j0f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float j1f ( float x ) | ✓ | ✓ | +| float j1f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float jnf ( int n, float x ) | ✓ | ✓ | +| float jnf ( int n, float x ) | ? 
| ? | | | | | | Calculate the value of the Bessel function of the first kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float lgammaf ( float x ) | ✓ | ✓ | +| float lgammaf ( float x ) | ? | ? | | | | | | Calculate the natural logarithm of the absolute value of the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llrintf ( float x ) | ✓ | ✓ | +| long long int llrintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llroundf ( float x ) | ✓ | ✓ | +| long long int llroundf ( float x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lrintf ( float x ) | ✓ | ✓ | +| long int lrintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lroundf ( float x ) | ✓ | ✓ | +| long int lroundf ( float x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float modff ( float x, float *iptr ) | ✓ | ✓ | +| float modff ( float x, float *iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nextafterf ( float x, float y ) | ✓ | ✓ | +| float nextafterf ( float x, float y ) | ? | ? | | | | | | Returns next representable single-precision floating-point value after argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm3df ( float a, float b, float c ) | ✓ | ✓ | +| float norm3df ( float a, float b, float c ) | ? | ? | | | | | | Calculate the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm4df ( float a, float b, float c, float d ) | ✓ | ✓ | +| float norm4df ( float a, float b, float c, float d ) | ? | ? | | | | | | Calculate the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normcdff ( float y ) | ✓ | ✓ | +| float normcdff ( float y ) | ? | ? | | | | | | Calculate the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normcdfinvf ( float y ) | ✓ | ✓ | +| float normcdfinvf ( float y ) | ? | ? 
| | | | | | Calculate the inverse of the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normf ( int dim, const float *a ) | ✓ | ✓ | +| float normf ( int dim, const float *a ) | ? | ? | | | | | | Calculate the square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rcbrtf ( float x ) | ✓ | ✓ | +| float rcbrtf ( float x ) | ? | ? | | | | | | Calculate the reciprocal cube root function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remquof ( float x, float y, int *quo ) | ✓ | ✓ | +| float remquof ( float x, float y, int *quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rhypotf ( float x, float y ) | ✓ | ✓ | +| float rhypotf ( float x, float y ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rintf ( float x ) | ✓ | ✓ | +| float rintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnorm3df ( float a, float b, float c ) | ✓ | ✓ | +| float rnorm3df ( float a, float b, float c ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnorm4df ( float a, float b, float c, float d ) | ✓ | ✓ | +| float rnorm4df ( float a, float b, float c, float d ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnormf ( int dim, const float *a ) | ✓ | ✓ | +| float rnormf ( int dim, const float *a ) | ? | ? | | | | | | Calculate the reciprocal of square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float scalblnf ( float x, long int n ) | ✓ | ✓ | +| float scalblnf ( float x, long int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincosf ( float x, float *sptr, float *cptr ) | ✓ | ✓ | +| void sincosf ( float x, float *sptr, float *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincospif ( float x, float *sptr, float *cptr ) | ✓ | ✓ | +| void sincospif ( float x, float *sptr, float *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument multiplied by PI. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float y0f ( float x ) | ✓ | ✓ | +| float y0f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float y1f ( float x ) | ✓ | ✓ | +| float y1f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ynf ( int n, float x ) | ✓ | ✓ | +| float ynf ( int n, float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. +[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. **Double Precision Mathematical Functions** @@ -603,348 +603,348 @@ Following is the list of supported double precision mathematical functions. +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ | Function | Supported on Host | Supported on Device | +====================================================================================================+===================+=====================+ -| double acos ( double x ) | ✓ | ✓ | +| double acos ( double x ) | ? | ? | | | | | | Calculate the arc cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double acosh ( double x ) | ✓ | ✓ | +| double acosh ( double x ) | ? | ? | | | | | | Calculate the nonnegative arc hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double asin ( double x ) | ✓ | ✓ | +| double asin ( double x ) | ? | ? | | | | | | Calculate the arc sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double asinh ( double x ) | ✓ | ✓ | +| double asinh ( double x ) | ? | ? | | | | | | Calculate the arc hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atan ( double x ) | ✓ | ✓ | +| double atan ( double x ) | ? | ? | | | | | | Calculate the arc tangent of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atan2 ( double y, double x ) | ✓ | ✓ | +| double atan2 ( double y, double x ) | ? | ? | | | | | | Calculate the arc tangent of the ratio of first and second input arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atanh ( double x ) | ✓ | ✓ | +| double atanh ( double x ) | ? | ? | | | | | | Calculate the arc hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cbrt ( double x ) | ✓ | ✓ | +| double cbrt ( double x ) | ? | ? | | | | | | Calculate the cube root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double ceil ( double x ) | ✓ | ✓ | +| double ceil ( double x ) | ? | ? | | | | | | Calculate ceiling of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double copysign ( double x, double y ) | ✓ | ✓ | +| double copysign ( double x, double y ) | ? | ? | | | | | | Create value with given magnitude, copying sign of second value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cos ( double x ) | ✓ | ✓ | +| double cos ( double x ) | ? | ? | | | | | | Calculate the cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cosh ( double x ) | ✓ | ✓ | +| double cosh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erf ( double x ) | ✓ | ✓ | +| double erf ( double x ) | ? | ? | | | | | | Calculate the error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfc ( double x ) | ✓ | ✓ | +| double erfc ( double x ) | ? | ? | | | | | | Calculate the complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp ( double x ) | ✓ | ✓ | +| double exp ( double x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp10 ( double x ) | ✓ | ✓ | +| double exp10 ( double x ) | ? | ? | | | | | | Calculate the base 10 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp2 ( double x ) | ✓ | ✓ | +| double exp2 ( double x ) | ? | ? 
| | | | | | Calculate the base 2 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double expm1 ( double x ) | ✓ | ✓ | +| double expm1 ( double x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument, minus 1. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fabs ( double x ) | ✓ | ✓ | +| double fabs ( double x ) | ? | ? | | | | | | Calculate the absolute value of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fdim ( double x, double y ) | ✓ | ✓ | +| double fdim ( double x, double y ) | ? | ? | | | | | | Compute the positive difference between x and y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double floor ( double x ) | ✓ | ✓ | +| double floor ( double x ) | ? | ? | | | | | | Calculate the largest integer less than or equal to x. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fma ( double x, double y, double z ) | ✓ | ✓ | +| double fma ( double x, double y, double z ) | ? | ? | | | | | -| Compute x × y + z as a single operation. | | | +| Compute x x y + z as a single operation. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmax ( double , double ) | ✓ | ✓ | +| double fmax ( double , double ) | ? | ? | | | | | | Determine the maximum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmin ( double x, double y ) | ✓ | ✓ | +| double fmin ( double x, double y ) | ? | ? | | | | | | Determine the minimum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmod ( double x, double y ) | ✓ | ✓ | +| double fmod ( double x, double y ) | ? | ? | | | | | | Calculate the floating-point remainder of x / y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double frexp ( double x, int* nptr ) | ✓ | ✗ | +| double frexp ( double x, int* nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double hypot ( double x, double y ) | ✓ | ✓ | +| double hypot ( double x, double y ) | ? | ? | | | | | | Calculate the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| int ilogb ( double x ) | ✓ | ✓ | +| int ilogb ( double x ) | ? | ? | | | | | | Compute the unbiased integer exponent of the argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isfinite ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isfinite ( double a ) | ? | ? | | | | | | Determine whether argument is finite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isinf ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isinf ( double a ) | ? | ? | | | | | | Determine whether argument is infinite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isnan ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isnan ( double a ) | ? | ? | | | | | | Determine whether argument is a NaN. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double ldexp ( double x, int exp ) | ✓ | ✓ | +| double ldexp ( double x, int exp ) | ? | ? | | | | | -| Calculate the value of x ⋅ 2exp. | | | +| Calculate the value of x ? 2exp. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log ( double x ) | ✓ | ✓ | +| double log ( double x ) | ? | ? | | | | | | Calculate the base e logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log10 ( double x ) | ✓ | ✓ | +| double log10 ( double x ) | ? | ? | | | | | | Calculate the base 10 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log1p ( double x ) | ✓ | ✓ | +| double log1p ( double x ) | ? | ? | | | | | | Calculate the value of loge( 1 + x ). | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log2 ( double x ) | ✓ | ✓ | +| double log2 ( double x ) | ? | ? | | | | | | Calculate the base 2 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double logb ( double x ) | ✓ | ✓ | +| double logb ( double x ) | ? | ? | | | | | | Calculate the floating point representation of the exponent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double modf ( double x, double* iptr ) | ✓ | ✗ | +| double modf ( double x, double* iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nan ( const char* tagp ) | ✗ | ✓ | +| double nan ( const char* tagp ) | ? | ? | | | | | | Returns "Not a Number"" value." | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nearbyint ( double x ) | ✓ | ✓ | +| double nearbyint ( double x ) | ? | ? 
| | | | | | Round the input argument to the nearest integer. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double pow ( double x, double y ) | ✓ | ✓ | +| double pow ( double x, double y ) | ? | ? | | | | | | Calculate the value of first argument to the power of second argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remainder ( double x, double y ) | ✓ | ✓ | +| double remainder ( double x, double y ) | ? | ? | | | | | | Compute double-precision floating-point remainder. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remquo ( double x, double y, int* quo ) | ✓ | ✗ | +| double remquo ( double x, double y, int* quo ) | ? | ? | | | | | | Compute double-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double round ( double x ) | ✓ | ✓ | +| double round ( double x ) | ? | ? | | | | | | Round to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double scalbn ( double x, int n ) | ✓ | ✓ | +| double scalbn ( double x, int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 signbit ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 signbit ( double a ) | ? | ? | | | | | | Return the sign bit of the input. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sin ( double x ) | ✓ | ✓ | +| double sin ( double x ) | ? | ? | | | | | | Calculate the sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincos ( double x, double* sptr, double* cptr ) | ✓ | ✗ | +| void sincos ( double x, double* sptr, double* cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sinh ( double x ) | ✓ | ✓ | +| double sinh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sqrt ( double x ) | ✓ | ✓ | +| double sqrt ( double x ) | ? | ? | | | | | | Calculate the square root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tan ( double x ) | ✓ | ✓ | +| double tan ( double x ) | ? | ? | | | | | | Calculate the tangent of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tanh ( double x ) | ✓ | ✓ | +| double tanh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tgamma ( double x ) | ✓ | ✓ | +| double tgamma ( double x ) | ? | ? | | | | | | Calculate the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double trunc ( double x ) | ✓ | ✓ | +| double trunc ( double x ) | ? | ? | | | | | | Truncate input argument to the integral part. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfcinv ( double y ) | ✓ | ✓ | +| double erfcinv ( double y ) | ? | ? | | | | | | Calculate the inverse complementary function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfcx ( double x ) | ✓ | ✓ | +| double erfcx ( double x ) | ? | ? | | | | | | Calculate the scaled complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfinv ( double y ) | ✓ | ✓ | +| double erfinv ( double y ) | ? | ? | | | | | | Calculate the inverse error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double frexp ( float x, int *nptr ) | ✓ | ✓ | +| double frexp ( float x, int *nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double j0 ( double x ) | ✓ | ✓ | +| double j0 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double j1 ( double x ) | ✓ | ✓ | +| double j1 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double jn ( int n, double x ) | ✓ | ✓ | +| double jn ( int n, double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double lgamma ( double x ) | ✓ | ✓ | +| double lgamma ( double x ) | ? | ? | | | | | | Calculate the natural logarithm of the absolute value of the gamma function of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llrint ( double x ) | ✓ | ✓ | +| long long int llrint ( double x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llround ( double x ) | ✓ | ✓ | +| long long int llround ( double x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lrint ( double x ) | ✓ | ✓ | +| long int lrint ( double x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lround ( double x ) | ✓ | ✓ | +| long int lround ( double x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double modf ( double x, double *iptr ) | ✓ | ✓ | +| double modf ( double x, double *iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nextafter ( double x, double y ) | ✓ | ✓ | +| double nextafter ( double x, double y ) | ? | ? | | | | | | Returns next representable single-precision floating-point value after argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double norm3d ( double a, double b, double c ) | ✓ | ✓ | +| double norm3d ( double a, double b, double c ) | ? | ? | | | | | | Calculate the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm4d ( double a, double b, double c, double d ) | ✓ | ✓ | +| float norm4d ( double a, double b, double c, double d ) | ? | ? | | | | | | Calculate the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double normcdf ( double y ) | ✓ | ✓ | +| double normcdf ( double y ) | ? | ? | | | | | | Calculate the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double normcdfinv ( double y ) | ✓ | ✓ | +| double normcdfinv ( double y ) | ? | ? | | | | | | Calculate the inverse of the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rcbrt ( double x ) | ✓ | ✓ | +| double rcbrt ( double x ) | ? | ? | | | | | | Calculate the reciprocal cube root function. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remquo ( double x, double y, int *quo ) | ✓ | ✓ | +| double remquo ( double x, double y, int *quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rhypot ( double x, double y ) | ✓ | ✓ | +| double rhypot ( double x, double y ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rint ( double x ) | ✓ | ✓ | +| double rint ( double x ) | ? | ? | | | | | | Round input to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm3d ( double a, double b, double c ) | ✓ | ✓ | +| double rnorm3d ( double a, double b, double c ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm4d ( double a, double b, double c, double d ) | ✓ | ✓ | +| double rnorm4d ( double a, double b, double c, double d ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm ( int dim, const double *a ) | ✓ | ✓ | +| double rnorm ( int dim, const double *a ) | ? | ? | | | | | | Calculate the reciprocal of square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double scalbln ( double x, long int n ) | ✓ | ✓ | +| double scalbln ( double x, long int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincos ( double x, double *sptr, double *cptr ) | ✓ | ✓ | +| void sincos ( double x, double *sptr, double *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincospi ( double x, double *sptr, double *cptr ) | ✓ | ✓ | +| void sincospi ( double x, double *sptr, double *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument multiplied by PI. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double y0f ( double x ) | ✓ | ✓ | +| double y0f ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 0 for the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double y1 ( double x ) | ✓ | ✓ | +| double y1 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double yn ( int n, double x ) | ✓ | ✓ | +| double yn ( int n, double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. +[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. **Integer Intrinsics** @@ -1038,23 +1038,23 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi +----------------------------------------------------------------------------+ | float __frsqrt_rn ( float x ) | | | -| Compute 1/√x in round-to-nearest-even mode. | +| Compute 1/?x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rd ( float x ) | | | -| Compute √x in round-down mode. | +| Compute ?x in round-down mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rn ( float x ) | | | -| Compute √x in round-to-nearest-even mode. | +| Compute ?x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | float __fsqrt_ru ( float x ) | | | -| Compute √x in round-up mode. | +| Compute ?x in round-up mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rz ( float x ) | | | -| Compute √x in round-towards-zero mode. | +| Compute ?x in round-towards-zero mode. | +----------------------------------------------------------------------------+ | float __log10f ( float x ) | | | @@ -1082,19 +1082,19 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi +----------------------------------------------------------------------------+ | double __dsqrt_rd ( double x ) | | | -| Compute √x in round-down mode. | +| Compute ?x in round-down mode. | +----------------------------------------------------------------------------+ | double __dsqrt_rn ( double x ) | | | -| Compute √x in round-to-nearest-even mode. | +| Compute ?x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | double __dsqrt_ru ( double x ) | | | -| Compute √x in round-up mode. | +| Compute ?x in round-up mode. | +----------------------------------------------------------------------------+ | double __dsqrt_rz ( double x ) | | | -| Compute √x in round-towards-zero mode. | +| Compute ?x in round-towards-zero mode. | +----------------------------------------------------------------------------+ Texture Functions @@ -1123,65 +1123,65 @@ HIP supports the following atomic operations. 
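Before the table, here is a minimal sketch of how these atomics are typically used in device code. The kernel and buffer names below are hypothetical illustrations, not part of the API list; only the atomicAdd signature and the coordinate built-ins come from this documentation.

::

    // Hypothetical histogram kernel: each thread atomically increments one bin.
    // Assumes every value in data[] already falls inside the valid bin range.
    __global__ void histogramKernel(const int *data, int *bins, size_t n)
    {
        size_t i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;  // coordinate built-ins
        if (i < n) {
            atomicAdd(&bins[data[i]], 1);   // int atomicAdd(int* address, int val)
        }
    }
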
+-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ | Function | Supported in HIP | Supported in CUDA | +=============================================================================================================================+==================+===================+ -| int atomicAdd(int* address, int val) | ✓ | ✓ | +| int atomicAdd(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| float atomicAdd(float* address, float val) | ✓ | ✓ | +| float atomicAdd(float* address, float val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicSub(int* address, int val) | ✓ | ✓ | +| int atomicSub(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicSub(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicExch(int* address, int val) | ✓ | ✓ | +| int atomicExch(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicExch(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| float atomicExch(float* address, float val) | ✓ | ✓ | +| float atomicExch(float* address, float val) | ? | ? 
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicMin(int* address, int val) | ✓ | ✓ | +| int atomicMin(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicMin(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicMax(int* address, int val) | ✓ | ✓ | +| int atomicMax(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicMax(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicInc(unsigned int* address) | ✗ | ✓ | +| unsigned int atomicInc(unsigned int* address) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicDec(unsigned int* address) | ✗ | ✓ | +| unsigned int atomicDec(unsigned int* address) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | +| int atomicCAS(int* address, int compare, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | +| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ? | ? 
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicAnd(int* address, int val) | ✓ | ✓ | +| int atomicAnd(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicOr(int* address, int val) | ✓ | ✓ | +| int atomicOr(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicOr(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicXor(int* address, int val) | ✓ | ✓ | +| int atomicXor(int* address, int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicXor(unsigned int* address,unsigned int val) | ? | ? | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ✓ | ✓ | +| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ? | ? 
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ @@ -1197,11 +1197,11 @@ Warp Cross Lane Functions Warp cross-lane functions operate across all lanes in a warp. The hardware guarantees that all warp lanes will execute in lockstep, so additional synchronization is unnecessary, and the instructions use no shared memory. -Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn’t assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources. In addition to the warpSize device function, host code can obtain the warpSize from the device properties:: - +Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn't assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources. In addition to the warpSize device function, host code can obtain the warpSize from the device properties:: + cudaDeviceProp props; cudaGetDeviceProperties(&props, deviceID); - int w = props.warpSize; + int w = props.warpSize; // implement portable algorithm based on w (rather than assume 32 or 64) **Warp Vote and Ballot Functions** @@ -1219,14 +1219,14 @@ Threads in a warp are referred to as lanes and are numbered from 0 to warpSize - Applications can test whether the target platform supports the any/all instruction using the hasWarpVote device property or the HIP_ARCH_HAS_WARP_VOTE compiler define. -__ballot provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's __ballot function supports a 64-bit return value (compared with Cuda’s 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the hasWarpBallot device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define. +__ballot provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's __ballot function supports a 64-bit return value (compared with Cuda's 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the hasWarpBallot device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define. Warp Shuffle Functions ************************ Half-float shuffles are not supported. The default width is warpSize---see :ref:`WarpCross` . Applications should not assume the warpSize is 32 or 64. - + :: int __shfl (int var, int srcLane, int width=warpSize); @@ -1235,7 +1235,7 @@ Half-float shuffles are not supported. 
The default width is warpSize---see :ref: float __shfl_up (float var, unsigned int delta, int width=warpSize); int __shfl_down (int var, unsigned int delta, int width=warpSize); float __shfl_down (float var, unsigned int delta, int width=warpSize) ; - int __shfl_xor (int var, int laneMask, int width=warpSize) + int __shfl_xor (int var, int laneMask, int width=warpSize) float __shfl_xor (float var, int laneMask, int width=warpSize); Profiler Counter Function @@ -1263,7 +1263,7 @@ hip_launch_bounds allows the application to provide usage hints that influence t :: __global__ void `__launch_bounds__`(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ... - MyKernel(hipGridLaunch lp, ...) + MyKernel(hipGridLaunch lp, ...) ... launch_bounds supports two parameters: @@ -1295,7 +1295,7 @@ CUDA defines a __launch_bounds which is also designed to control occupancy: :: * The second parameter __launch_bounds parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors ( This conversion is performed automatically by the clang hipify tools.) :: - + MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK)/32 @@ -1320,14 +1320,14 @@ Unroll with a bounds that is known at compile-time is supported. For example:: #pragma unroll 16 /* hint to compiler to unroll next loop by 16 */ for (int i=0; i<16; i++) ... -:: - +:: + #pragma unroll 1 /* tell compiler to never unroll the loop */ for (int i=0; i<16; i++) ... -Unbounded loop unroll is under development on HCC compiler. +Unbounded loop unroll is under development on HCC compiler. :: - + #pragma unroll /* hint to compiler to completely unroll next loop. */ for (int i=0; i<16; i++) ... @@ -1348,12 +1348,12 @@ Kernel Compilation hipcc now supports compiling C++/HIP kernels to binary code objects. The user can specify the target for which the binary can be generated. HIP/HCC does not yet support fat binaries so only a single target may be specified. The file format for binary is .co which means Code Object. The following command builds the code object using hipcc. -:: +:: hipcc --genco --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE] :: - + [INPUT FILE] = Name of the file containing kernels [OUTPUT FILE] = Name of the generated code object file diff --git a/Programming_Guides/HIP-porting-guide.rst b/Programming_Guides/HIP-porting-guide.rst index a6315f25..b5c578a3 100644 --- a/Programming_Guides/HIP-porting-guide.rst +++ b/Programming_Guides/HIP-porting-guide.rst @@ -7,7 +7,7 @@ HIP Porting Guide ~~~~~~~~~~~~~~~~~ In addition to providing a portable C++ programming environment for GPUs, HIP is designed to ease the porting of existing CUDA code into the HIP environment. This section describes the available tools and provides practical suggestions on how to port CUDA code and work through common issues. - + Porting a New Cuda Project ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -24,8 +24,8 @@ Scanning existing CUDA code to scope the porting effort The hipexamine.sh tool will scan a source directory to determine which files contain CUDA code and how much of that code can be automatically hipified, -:: - +:: + > cd examples/rodinia_3.0/cuda/kmeans > $HIP_DIR/bin/hipexamine.sh . info: hipify ./kmeans.h =====> @@ -47,10 +47,10 @@ hipexamine scans each code file (cpp, c, h, hpp, etc) found in the specified dir * Files with no CUDA code (ie kmeans.h) print one line summary just listing the source file name. 
* Files with CUDA code print a summary of what was found - for example the kmeans_cuda_kernel.cu file: - :: - + :: + info: hipify ./kmeans_cuda_kernel.cu =====> - info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 + info: converted 40 CUDA->HIP refs( dev:0 mem:0 kern:0 builtin:37 math:0 stream:0 event:0 * Interesting information in kmeans_cuda_kernel.cu : * How many CUDA calls were converted to HIP (40) @@ -60,7 +60,7 @@ hipexamine scans each code file (cpp, c, h, hpp, etc) found in the specified dir * hipexamine also presents a summary at the end of the process for the statistics collected across all files. This has similar format to the per-file reporting, and also includes a list of all kernels which have been called. An example from above: -:: +:: info: TOTAL-converted 89 CUDA->HIP refs( dev:3 mem:32 kern:2 builtin:37 math:0 stream:0 event:0 err:0 def:0 tex:15 other:0 ) warn:0 LOC:3607 kernels (1 total) : kmeansPoint(1) @@ -68,7 +68,7 @@ hipexamine scans each code file (cpp, c, h, hpp, etc) found in the specified dir Converting a project "in-place" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:: +:: > hipify --inplace @@ -82,7 +82,7 @@ This is useful for testing improvements to the hipify toolset. The `hipconvertinplace.sh `_ script will perform inplace conversion for all code files in the specified directory. This can be quite handy when dealing with an existing CUDA code base since the script preserves the existing directory structure and filenames - so includes work. After converting in-place, you can review the code to add additional parameters to directory names. :: - + > hipconvertinplace.sh MY_SRC_DIR Distinguishing Compiler Modes @@ -103,27 +103,27 @@ Identifying the Compiler: hcc, hip-clang or nvcc Often, it useful to know whether the underlying compiler is hcc or nvcc. This knowledge can guard platform-specific code (features that only work on the nvcc or hcc path but not both) or aid in platform-specific performance tuning. -:: +:: #ifdef __HCC__ - // Compiled with hcc - + // Compiled with hcc + :: #ifdef __HIP__ - // Compiled with hip-clang + // Compiled with hip-clang :: #ifdef __NVCC__ - // Compiled with nvcc + // Compiled with nvcc // Could be compiling with Cuda language extensions enabled (for example, a ".cu file) // Could be in pass-through mode to an underlying host compile OR (for example, a .cpp file) - -:: + +:: #ifdef __CUDACC__ - // Compiled with nvcc (Cuda language extensions enabled) + // Compiled with nvcc (Cuda language extensions enabled) hcc and hip-clang directly generates the host code (using the Clang x86 target) and passes the code to another host compiler. Thus, they have no equivalent of the __CUDA_ACC define. @@ -136,9 +136,9 @@ Identifying Current Compilation Pass: Host or Device Both nvcc and hcc make two passes over the code: one for host code and one for device code. __HIP_DEVICE_COMPILE__ is set to a nonzero value when the compiler (hcc or nvcc) is compiling code for a device inside a __global__ kernel or for a device function. __HIP_DEVICE_COMPILE__ can replace #ifdef checks on the __CUDA_ARCH__ define. :: - - // #ifdef __CUDA_ARCH__ - + + // #ifdef __CUDA_ARCH__ + #if __HIP_DEVICE_COMPILE__ Unlike __CUDA_ARCH__, the __HIP_DEVICE_COMPILE__ value is 1 or undefined, and it doesn't represent the feature capability of the target device. 
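As a minimal sketch of how the defines described above can be combined in portable code (illustrative only; the add_one function is hypothetical, while the macros are the ones documented here):

::

    #include "hip/hip_runtime.h"

    __host__ __device__ float add_one(float x)
    {
    #if __HIP_DEVICE_COMPILE__
        // Device compilation pass (replaces #ifdef __CUDA_ARCH__ checks).
        return x + 1.0f;
    #else
        // Host compilation pass.
        return x + 1.0f;
    #endif
    }

    #ifdef __HIPCC__
        // Compiled through hipcc, on either the hcc/hip-clang or the nvcc path.
    #endif
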
@@ -149,48 +149,48 @@ Compiler Defines: Summary +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | Define | hcc | nvcc | Other (GCC, ICC, Clang, etc.) | +===========================+===============================+=================================+======================================+ - |HIP-related defines: | + |HIP-related defines: | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __HIP_PLATFORM_HCC___ | Defined | Undefined | | Defined if targeting hcc platform; | - | | | | | undefined otherwise | + | | | | | undefined otherwise | | | | | | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __HIP_PLATFORM_NVCC___ | Undefined | defined | | Defined if targeting NVcc platform;| - | | | | | undefined otherwise | + | | | | | undefined otherwise | | | | | | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ - | | | 1 if compiling for device; | | 1 if compiling for device; | | - |__HIP_DEVICE_COMPILE__ | | undefined if compiling | | undefined if compiling | Undefined | - | | | for host | | for host | | + | | | 1 if compiling for device; | | 1 if compiling for device; | | + |__HIP_DEVICE_COMPILE__ | | undefined if compiling | | undefined if compiling | Undefined | + | | | for host | | for host | | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __HIPCC__ | Defined | Defined | Undefined | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | | | 0 or 1 depending on feature | | 0 or 1 depending on feature | | | __HIP_ARCH_* | | support (see below) | | support (see below) | 0 | | | | | | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ - | nvcc-related defines: | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + | nvcc-related defines: | + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __CUDACC__ | Undefined | | Defined if source code is | | | | | | compiled by nvcc; | Undefined | | | | | undefined otherwise | | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __NVCC__ | Undefined | Defined | Undefined | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ - | | | | Unsigned representing compute | | - | __CUDA_ARCH__ | Undefined | | capability (e.g., "130")if in | Undefined | + | | | | Unsigned representing compute | | + | __CUDA_ARCH__ | Undefined | | 
capability (e.g., "130")if in | Undefined | | | | | device code; 0 if in host code| | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | hcc-related defines: | - +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ + +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | __HCC__ | Defined | Undefined | Undefined | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | | | Nonzero if in device code; | | | - | __HCC_ACCELERATOR__ | | otherwise undefined | Undefined | Undefined | - | | | | | + | __HCC_ACCELERATOR__ | | otherwise undefined | Undefined | Undefined | + | | | | | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ | | Defined | Undefined | | Defined if using Clang; | - | __clang__ | | | | otherwise undefined | + | __clang__ | | | | otherwise undefined | +---------------------------+-------------------------------+---------------------------------+--------------------------------------+ Identifying Architecture Features @@ -201,8 +201,8 @@ HIP_ARCH Defines Some Cuda code tests __CUDA_ARCH__ for a specific value to determine whether the machine supports a certain architectural feature. For instance, :: - - #if (__CUDA_ARCH__ >= 130) + + #if (__CUDA_ARCH__ >= 130) // doubles are supported @@ -212,7 +212,7 @@ This type of code requires special attention, since hcc/AMD and nvcc/Cuda device The __HIP_ARCH_* defines can replace comparisons of __CUDA_ARCH__ values: :: - + //#if (__CUDA_ARCH__ >= 130) // non-portable if __HIP_ARCH_HAS_DOUBLES__ { // portable HIP feature query // doubles are supported @@ -241,32 +241,32 @@ The table below shows the full set of architectural properties that HIP supports |Define (use only in device code) | Device Property (run-time query) | Comment | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | 32-bit atomics: | | - +------------------------------------------+-----------------------------------+----------------------------------------------------+ + +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ | hasGlobalInt32Atomics | 32-bit integer atomics for global memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__| hasGlobalFloatAtomicExch | 32-bit float atomic exchange for global memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ | hasSharedInt32Atomics | 32-bit integer atomics for shared memory | + | __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ | hasSharedInt32Atomics | 32-bit integer atomics for shared memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__| hasSharedFloatAtomicExch | 32-bit float atomic exchange for shared memory | - 
+------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory | + +------------------------------------------+-----------------------------------+----------------------------------------------------+ + | __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ | hasFloatAtomicAdd |32-bit float atomic add in global and shared memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | 64-bit atomics: | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ | hasGlobalInt64Atomics | 64-bit integer atomics for global memory | + | __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ | hasGlobalInt64Atomics | 64-bit integer atomics for global memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ | hasSharedInt64Atomics | 64-bit integer atomics for shared memory | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | Doubles: | + | Doubles: | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_DOUBLES__ | hasDoubles | Double-precision floating point | + | __HIP_ARCH_HAS_DOUBLES__ | hasDoubles | Double-precision floating point | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | Warp cross-lane operations: | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_WARP_VOTE__ | hasWarpVote | Warp vote instructions (any, all) | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_WARP_BALLOT__ | hasWarpBallot | Warp ballot instructions | + | __HIP_ARCH_HAS_WARP_BALLOT__ | hasWarpBallot | Warp ballot instructions | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_WARP_SHUFFLE__ | hasWarpShuffle | Warp shuffle operations (shfl_*) | +------------------------------------------+-----------------------------------+----------------------------------------------------+ @@ -277,15 +277,15 @@ The table below shows the full set of architectural properties that HIP supports | __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ | hasThreadFenceSystem | threadfence_system | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_SYNC_THREAD_EXT__ | hasSyncThreadsExt | syncthreads_count, syncthreads_and, syncthreads_or | - +------------------------------------------+-----------------------------------+----------------------------------------------------+ + +------------------------------------------+-----------------------------------+----------------------------------------------------+ | Miscellaneous: | 
+------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_SURFACE_FUNCS__ | hasSurfaceFuncs | | +------------------------------------------+-----------------------------------+----------------------------------------------------+ | __HIP_ARCH_HAS_3DGRID__ | has3dGrid | Grids and groups are 3D | +------------------------------------------+-----------------------------------+----------------------------------------------------+ - | __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ | hasDynamicParallelism | | - +------------------------------------------+-----------------------------------+----------------------------------------------------+ + | __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ | hasDynamicParallelism | | + +------------------------------------------+-----------------------------------+----------------------------------------------------+ Finding HIP @@ -315,11 +315,11 @@ While this can be a convenient single-line kernel launch syntax, the macro imple Avoid nesting macro parameters inside parenthesis - here's an alternative that will work: :: - + #define MY_LAUNCH(command, doTrace) \ {\ if (doTrace) printf ("TRACE: %s\n", #command); \ - command;\ + command;\ } MY_LAUNCH (hipLaunchKernelGGL(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall"); @@ -328,12 +328,12 @@ Compiler Options ~~~~~~~~~~~~~~~~ hipcc is a portable compiler driver that will call nvcc or hcc (depending on the target system) and attach all required include and library options. It passes options through to the target compiler. Tools that call hipcc must ensure the compiler options are appropriate for the target compiler. The hipconfig script may helpful in making infrastructure that identifies the target platform and sets options appropriately. It returns either "nvcc" or "hcc." The following sample shows the script in a makefile: -:: +:: HIP_PLATFORM=$(shell hipconfig --compiler) ifeq (${HIP_PLATFORM}, nvcc) - HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 + HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif ifeq (${HIP_PLATFORM}, hcc) HIPCC_FLAGS = -Wno-deprecated-register @@ -387,7 +387,7 @@ You can compile hip_runtime_api.h using a standard C or C++ compiler (e.g., gcc :: - > hipconfig --cxx_config + > hipconfig --cxx_config -D__HIP_PLATFORM_HCC__ -I/home/user1/hip/include You can capture the hipconfig output and passed it to the standard compiler; below is a sample makefile syntax: @@ -470,7 +470,7 @@ Device Code: } std::cout<<"Passed"< - + ... - + <hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0 libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0 @@ -64,9 +64,9 @@ ltrace can be easily combined with the HIP_DB switches to visualize the runtime libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0 libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0 <... hsa_amd_memory_unlock resumed> ) = 0 - hip-api tid:1.17 hipMemcpy + hip-api tid:1.17 hipMemcpy ret= 0 (hipSuccess)>> - + Some key information from the trace above. @@ -108,15 +108,15 @@ Debugging HIP Applications * The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread. This can be useful for setting conditional breakpoints. Also, each new HIP thread is mapped to monotonically increasing shortTid ID. 
Both of these fields are displayed in the HIP debug info. - :: + :: (gdb) p tls_tidInfo $32 = {_shortTid = 1, _apiSeqNum = 803} - + * HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc". If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output. This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.. The gdb syntax also supports using the variable name (in this case 'dst'): :: - + (gdb) p dst $33 = (void *) 0x5ec7e9000 (gdb) call hc::am_memtracker_print(dst) @@ -125,16 +125,16 @@ Debugging HIP Applications ... -->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) - To debug an explicit address, cast the address to (void*) + To debug an explicit address, cast the address to (void*) :: - + (gdb) call hc::am_memtracker_print((void*)0x508c7f000) * Debugging GPUVM fault. For example: Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege. :: - + Program received signal SIGABRT, Aborted. [Switching to Thread 0x7fffdffb5700 (LWP 14893)] 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 @@ -163,9 +163,9 @@ Debugging HIP Applications #3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so #4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so #5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so - #6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 + #6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 ... - + .. _General Debugging Tips: diff --git a/Programming_Guides/Kernel_language.rst b/Programming_Guides/Kernel_language.rst index faf5b330..de8bf4ea 100644 --- a/Programming_Guides/Kernel_language.rst +++ b/Programming_Guides/Kernel_language.rst @@ -63,7 +63,7 @@ HIP provides a C++ syntax that is suitable for compiling most code that commonly * Math functions resembling those in the "math.h" header included with standard C++ compilers * Built-in functions for accessing specific GPU hardware capabilities -This section describes the built-in variables and functions accessible from the HIP kernel. It’s intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different. +This section describes the built-in variables and functions accessible from the HIP kernel. It's intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different. Features are marked with one of the following keywords: @@ -134,14 +134,14 @@ Calling __global__ Functions __global__ MyKernel(float *A, float *B, float *C, size_t N) { ... 
- } - + } + // Replace MyKernel<<<gridDim, groupDim>>> (a,b,c,n); - + hipLaunchKernelGGL(MyKernel, dim3(gridDim), dim3(groupDim), 0/*dynamicShared*/, 0/*stream*/, a, b, c, n); - -The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See `dim3 `_. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that’s currently executing. See `Coordinate Built-Ins `_. + +The hipLaunchKernelGGL macro always starts with the five parameters specified above, followed by the kernel arguments. The Hipify script automatically converts Cuda launch syntax to hipLaunchKernelGGL, including conversion of optional arguments in <<< >>> to the five required hipLaunchKernelGGL parameters. The dim3 constructor accepts zero to three arguments and will by default initialize unspecified dimensions to 1. See `dim3 `_. The kernel uses the coordinate built-ins (hipThread*, hipBlock*, hipGrid*) to determine coordinate index and coordinate bounds of the work item that's currently executing. See `Coordinate Built-Ins `_. .. _Kernel-Launch-Example: @@ -150,15 +150,15 @@ Kernel-Launch Example :: - // Example showing device function, __device__ __host__ - // <- compile for both device and host - float PlusOne(float x) + // Example showing device function, __device__ __host__ + // <- compile for both device and host + float PlusOne(float x) { return x + 1.0; } - - __global__ - void + + __global__ + void MyKernel (const float *a, const float *b, float *c, unsigned N) { unsigned gid = hipThreadIdx_x; // <- coordinate index function @@ -170,11 +170,11 @@ Kernel-Launch Example { float *a, *b, *c; // initialization not shown... unsigned N = 1000000; - const unsigned blockSize = 256; - + const unsigned blockSize = 256; + hipLaunchKernelGGL(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N); } - + .. _Variable-Type-Qualifiers: Variable-Type Qualifiers @@ -260,13 +260,13 @@ Note that these types are defined in hip_runtime.h and are not automatically pro Short Vector Types ++++++++++++++++++++ -Short vector types derive from the basic integer and floating-point types. They are structures defined in hip_vector_types.h. The first, second, third and fourth components of the vector are accessible through the *x, y, z* and *w* fields, respectively. All the short vector types support a constructor function of the form make_(). +Short vector types derive from the basic integer and floating-point types. They are structures defined in hip_vector_types.h. The first, second, third and fourth components of the vector are accessible through the *x, y, z* and *w* fields, respectively. All the short vector types support a constructor function of the form make_(). For example, ``float4 make_float4(float x, float y, float z, float w)`` creates a vector of type float4 and value (x,y,z,w).
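As a rough sketch of how the make_ constructor functions and the *x, y, z, w* fields are typically used together, the short program below scales a float4 array on the device. The kernel name ScaleVec4, the array size, and the fill values are illustrative assumptions for this example only; they are not part of the HIP headers, and error checking is omitted for brevity.

::

    #include <hip/hip_runtime.h>
    #include <vector>

    // Illustrative kernel: scale each float4 element component-wise.
    __global__ void ScaleVec4(const float4 *in, float4 *out, float s, unsigned N)
    {
        unsigned gid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
        if (gid < N) {
            float4 v = in[gid];
            // Components are read and written through the x, y, z and w fields:
            out[gid] = make_float4(v.x * s, v.y * s, v.z * s, v.w * s);
        }
    }

    int main()
    {
        const unsigned N = 256;
        // make_float4 also constructs short vector values on the host:
        std::vector<float4> hIn(N, make_float4(1.0f, 2.0f, 3.0f, 4.0f));

        float4 *dIn, *dOut;
        hipMalloc(&dIn,  N * sizeof(float4));
        hipMalloc(&dOut, N * sizeof(float4));
        hipMemcpy(dIn, hIn.data(), N * sizeof(float4), hipMemcpyHostToDevice);

        // One block of N threads is enough for this small example.
        hipLaunchKernelGGL(ScaleVec4, dim3(1), dim3(N), 0, 0, dIn, dOut, 2.0f, N);
        hipDeviceSynchronize();

        hipFree(dIn);
        hipFree(dOut);
        return 0;
    }

The same pattern applies to the other short vector types listed below.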
HIP supports the following short vector formats: * Signed Integers: - + * char1, char2, char3, char4 * short1, short2, short3, short4 * int1, int2, int3, int4 @@ -274,7 +274,7 @@ HIP supports the following short vector formats: * longlong1, longlong2, longlong3, longlong4 * Unsigned Integers: - + * uchar1, uchar2, uchar3, uchar4 * ushort1, ushort2, ushort3, ushort4 * uint1, uint2, uint3, uint4 @@ -282,7 +282,7 @@ HIP supports the following short vector formats: * ulonglong1, ulonglong2, ulonglong3, ulonglong4 * Floating Points - + * float1, float2, float3, float4 * double1, double2, double3, double4 @@ -295,13 +295,13 @@ dim3 is a three-dimensional integer vector type commonly used to specify grid an :: typedef struct dim3 { - uint32_t x; - uint32_t y; - uint32_t z; - - dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {}; - }; - + uint32_t x; + uint32_t y; + uint32_t z; + + dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {}; + }; + .. _Memory-Fence-Instructions: Memory-Fence Instructions @@ -337,351 +337,351 @@ Following is the list of supported single precision mathematical functions. +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ | Function | Supported on Host | Supported on Device | +====================================================================================================+===================+=====================+ -| float acosf ( float x ) | ✓ | ✓ | +| float acosf ( float x ) | ? | ? | | | | | | Calculate the arc cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float acoshf ( float x ) | ✓ | ✓ | +| float acoshf ( float x ) | ? | ? | | | | | | Calculate the nonnegative arc hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float asinf ( float x ) | ✓ | ✓ | +| float asinf ( float x ) | ? | ? | | | | | | Calculate the arc sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float asinhf ( float x ) | ✓ | ✓ | +| float asinhf ( float x ) | ? | ? | | | | | | Calculate the arc hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atan2f ( float y, float x ) | ✓ | ✓ | +| float atan2f ( float y, float x ) | ? | ? | | | | | | Calculate the arc tangent of the ratio of first and second input arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atanf ( float x ) | ✓ | ✓ | +| float atanf ( float x ) | ? | ? | | | | | | Calculate the arc tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float atanhf ( float x ) | ✓ | ✓ | +| float atanhf ( float x ) | ? | ? | | | | | | Calculate the arc hyperbolic tangent of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float cbrtf ( float x ) | ✓ | ✓ | +| float cbrtf ( float x ) | ? | ? | | | | | | Calculate the cube root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ceilf ( float x ) | ✓ | ✓ | +| float ceilf ( float x ) | ? | ? | | | | | | Calculate ceiling of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float copysignf ( float x, float y ) | ✓ | ✓ | +| float copysignf ( float x, float y ) | ? | ? | | | | | | Create value with given magnitude, copying sign of second value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float cosf ( float x ) | ✓ | ✓ | +| float cosf ( float x ) | ? | ? | | | | | | Calculate the cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float coshf ( float x ) | ✓ | ✓ | +| float coshf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcf ( float x ) | ✓ | ✓ | +| float erfcf ( float x ) | ? | ? | | | | | | Calculate the complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erff ( float x ) | ✓ | ✓ | +| float erff ( float x ) | ? | ? | | | | | | Calculate the error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float exp10f ( float x ) | ✓ | ✓ | +| float exp10f ( float x ) | ? | ? | | | | | | Calculate the base 10 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float exp2f ( float x ) | ✓ | ✓ | +| float exp2f ( float x ) | ? | ? | | | | | | Calculate the base 2 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float expf ( float x ) | ✓ | ✓ | +| float expf ( float x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float expm1f ( float x ) | ✓ | ✓ | +| float expm1f ( float x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument, minus 1. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fabsf ( float x ) | ✓ | ✓ | +| float fabsf ( float x ) | ? | ? | | | | | | Calculate the absolute value of its argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fdimf ( float x, float y ) | ✓ | ✓ | +| float fdimf ( float x, float y ) | ? | ? | | | | | | Compute the positive difference between x and y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float floorf ( float x ) | ✓ | ✓ | +| float floorf ( float x ) | ? | ? | | | | | | Calculate the largest integer less than or equal to x. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmaf ( float x, float y, float z ) | ✓ | ✓ | +| float fmaf ( float x, float y, float z ) | ? | ? | | | | | -| Compute x × y + z as a single operation. | | | +| Compute x x y + z as a single operation. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmaxf ( float x, float y ) | ✓ | ✓ | +| float fmaxf ( float x, float y ) | ? | ? | | | | | | Determine the maximum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fminf ( float x, float y ) | ✓ | ✓ | +| float fminf ( float x, float y ) | ? | ? | | | | | | Determine the minimum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fmodf ( float x, float y ) | ✓ | ✓ | +| float fmodf ( float x, float y ) | ? | ? | | | | | | Calculate the floating-point remainder of x / y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float frexpf ( float x, int* nptr ) | ✓ | ✗ | +| float frexpf ( float x, int* nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float hypotf ( float x, float y ) | ✓ | ✓ | +| float hypotf ( float x, float y ) | ? | ? | | | | | | Calculate the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| int ilogbf ( float x ) | ✓ | ✓ | +| int ilogbf ( float x ) | ? | ? | | | | | | Compute the unbiased integer exponent of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isfinite ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isfinite ( float a ) | ? | ? | | | | | | Determine whether argument is finite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isinf ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isinf ( float a ) | ? | ? | | | | | | Determine whether argument is infinite. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isnan ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 isnan ( float a ) | ? | ? | | | | | | Determine whether argument is a NaN. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ldexpf ( float x, int exp ) | ✓ | ✓ | +| float ldexpf ( float x, int exp ) | ? | ? | | | | | -| Calculate the value of x ⋅ 2exp. | | | +| Calculate the value of x ? 2exp. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log10f ( float x ) | ✓ | ✓ | +| float log10f ( float x ) | ? | ? | | | | | | Calculate the base 10 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log1pf ( float x ) | ✓ | ✓ | +| float log1pf ( float x ) | ? | ? | | | | | | Calculate the value of loge( 1 + x ). | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float logbf ( float x ) | ✓ | ✓ | +| float logbf ( float x ) | ? | ? | | | | | | Calculate the floating point representation of the exponent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float log2f ( float x ) | ✓ | ✓ | +| float log2f ( float x ) | ? | ? | | | | | | Calculate the base 2 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float logf ( float x ) | ✓ | ✓ | +| float logf ( float x ) | ? | ? | | | | | | Calculate the natural logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float modff ( float x, float* iptr ) | ✓ | ✗ | +| float modff ( float x, float* iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nanf ( const char* tagp ) | ✗ | ✓ | +| float nanf ( const char* tagp ) | ? | ? | | | | | | Returns "Not a Number"" value." | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nearbyintf ( float x ) | ✓ | ✓ | +| float nearbyintf ( float x ) | ? | ? | | | | | | Round the input argument to the nearest integer. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float powf ( float x, float y ) | ✓ | ✓ | +| float powf ( float x, float y ) | ? | ? | | | | | | Calculate the value of first argument to the power of second argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remainderf ( float x, float y ) | ✓ | ✓ | +| float remainderf ( float x, float y ) | ? | ? | | | | | | Compute single-precision floating-point remainder. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remquof ( float x, float y, int* quo ) | ✓ | ✗ | +| float remquof ( float x, float y, int* quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float roundf ( float x ) | ✓ | ✓ | +| float roundf ( float x ) | ? | ? | | | | | | Round to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float scalbnf ( float x, int n ) | ✓ | ✓ | +| float scalbnf ( float x, int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 signbit ( float a ) | ✓ | ✓ | +| __RETURN_TYPE1 signbit ( float a ) | ? | ? | | | | | | Return the sign bit of the input. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincosf ( float x, float* sptr, float* cptr ) | ✓ | ✗ | +| void sincosf ( float x, float* sptr, float* cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sinf ( float x ) | ✓ | ✓ | +| float sinf ( float x ) | ? | ? | | | | | | Calculate the sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sinhf ( float x ) | ✓ | ✓ | +| float sinhf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float sqrtf ( float x ) | ✓ | ✓ | +| float sqrtf ( float x ) | ? | ? | | | | | | Calculate the square root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tanf ( float x ) | ✓ | ✓ | +| float tanf ( float x ) | ? | ? | | | | | | Calculate the tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tanhf ( float x ) | ✓ | ✓ | +| float tanhf ( float x ) | ? | ? | | | | | | Calculate the hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float truncf ( float x ) | ✓ | ✓ | +| float truncf ( float x ) | ? | ? 
| | | | | | Truncate input argument to the integral part. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float tgammaf ( float x ) | ✓ | ✓ | +| float tgammaf ( float x ) | ? | ? | | | | | | Calculate the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcinvf ( float y ) | ✓ | ✓ | +| float erfcinvf ( float y ) | ? | ? | | | | | | Calculate the inverse complementary function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfcxf ( float x ) | ✓ | ✓ | +| float erfcxf ( float x ) | ? | ? | | | | | | Calculate the scaled complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float erfinvf ( float y ) | ✓ | ✓ | +| float erfinvf ( float y ) | ? | ? | | | | | | Calculate the inverse error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float fdividef ( float x, float y ) | ✓ | ✓ | +| float fdividef ( float x, float y ) | ? | ? | | | | | | Divide two floating point values. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float frexpf ( float x, int *nptr ) | ✓ | ✓ | +| float frexpf ( float x, int *nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float j0f ( float x ) | ✓ | ✓ | +| float j0f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float j1f ( float x ) | ✓ | ✓ | +| float j1f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float jnf ( int n, float x ) | ✓ | ✓ | +| float jnf ( int n, float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float lgammaf ( float x ) | ✓ | ✓ | +| float lgammaf ( float x ) | ? | ? | | | | | | Calculate the natural logarithm of the absolute value of the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llrintf ( float x ) | ✓ | ✓ | +| long long int llrintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llroundf ( float x ) | ✓ | ✓ | +| long long int llroundf ( float x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lrintf ( float x ) | ✓ | ✓ | +| long int lrintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lroundf ( float x ) | ✓ | ✓ | +| long int lroundf ( float x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float modff ( float x, float *iptr ) | ✓ | ✓ | +| float modff ( float x, float *iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float nextafterf ( float x, float y ) | ✓ | ✓ | +| float nextafterf ( float x, float y ) | ? | ? | | | | | | Returns next representable single-precision floating-point value after argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm3df ( float a, float b, float c ) | ✓ | ✓ | +| float norm3df ( float a, float b, float c ) | ? | ? | | | | | | Calculate the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm4df ( float a, float b, float c, float d ) | ✓ | ✓ | +| float norm4df ( float a, float b, float c, float d ) | ? | ? | | | | | | Calculate the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normcdff ( float y ) | ✓ | ✓ | +| float normcdff ( float y ) | ? | ? | | | | | | Calculate the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normcdfinvf ( float y ) | ✓ | ✓ | +| float normcdfinvf ( float y ) | ? | ? | | | | | | Calculate the inverse of the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float normf ( int dim, const float *a ) | ✓ | ✓ | +| float normf ( int dim, const float *a ) | ? | ? | | | | | | Calculate the square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rcbrtf ( float x ) | ✓ | ✓ | +| float rcbrtf ( float x ) | ? | ? | | | | | | Calculate the reciprocal cube root function. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float remquof ( float x, float y, int *quo ) | ✓ | ✓ | +| float remquof ( float x, float y, int *quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rhypotf ( float x, float y ) | ✓ | ✓ | +| float rhypotf ( float x, float y ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rintf ( float x ) | ✓ | ✓ | +| float rintf ( float x ) | ? | ? | | | | | | Round input to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnorm3df ( float a, float b, float c ) | ✓ | ✓ | +| float rnorm3df ( float a, float b, float c ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnorm4df ( float a, float b, float c, float d ) | ✓ | ✓ | +| float rnorm4df ( float a, float b, float c, float d ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float rnormf ( int dim, const float *a ) | ✓ | ✓ | +| float rnormf ( int dim, const float *a ) | ? | ? | | | | | | Calculate the reciprocal of square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float scalblnf ( float x, long int n ) | ✓ | ✓ | +| float scalblnf ( float x, long int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincosf ( float x, float *sptr, float *cptr ) | ✓ | ✓ | +| void sincosf ( float x, float *sptr, float *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincospif ( float x, float *sptr, float *cptr ) | ✓ | ✓ | +| void sincospif ( float x, float *sptr, float *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument multiplied by PI. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float y0f ( float x ) | ✓ | ✓ | +| float y0f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 0 for the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float y1f ( float x ) | ✓ | ✓ | +| float y1f ( float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float ynf ( int n, float x ) | ✓ | ✓ | +| float ynf ( int n, float x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ @@ -699,348 +699,348 @@ Following is the list of supported double precision mathematical functions. +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ | Function | Supported on Host | Supported on Device | +====================================================================================================+===================+=====================+ -| double acos ( double x ) | ✓ | ✓ | +| double acos ( double x ) | ? | ? | | | | | | Calculate the arc cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double acosh ( double x ) | ✓ | ✓ | +| double acosh ( double x ) | ? | ? | | | | | | Calculate the nonnegative arc hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double asin ( double x ) | ✓ | ✓ | +| double asin ( double x ) | ? | ? | | | | | | Calculate the arc sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double asinh ( double x ) | ✓ | ✓ | +| double asinh ( double x ) | ? | ? | | | | | | Calculate the arc hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atan ( double x ) | ✓ | ✓ | +| double atan ( double x ) | ? | ? | | | | | | Calculate the arc tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atan2 ( double y, double x ) | ✓ | ✓ | +| double atan2 ( double y, double x ) | ? | ? | | | | | | Calculate the arc tangent of the ratio of first and second input arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double atanh ( double x ) | ✓ | ✓ | +| double atanh ( double x ) | ? | ? | | | | | | Calculate the arc hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cbrt ( double x ) | ✓ | ✓ | +| double cbrt ( double x ) | ? | ? | | | | | | Calculate the cube root of the input argument. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double ceil ( double x ) | ✓ | ✓ | +| double ceil ( double x ) | ? | ? | | | | | | Calculate ceiling of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double copysign ( double x, double y ) | ✓ | ✓ | +| double copysign ( double x, double y ) | ? | ? | | | | | | Create value with given magnitude, copying sign of second value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cos ( double x ) | ✓ | ✓ | +| double cos ( double x ) | ? | ? | | | | | | Calculate the cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double cosh ( double x ) | ✓ | ✓ | +| double cosh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic cosine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erf ( double x ) | ✓ | ✓ | +| double erf ( double x ) | ? | ? | | | | | | Calculate the error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfc ( double x ) | ✓ | ✓ | +| double erfc ( double x ) | ? | ? | | | | | | Calculate the complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp ( double x ) | ✓ | ✓ | +| double exp ( double x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp10 ( double x ) | ✓ | ✓ | +| double exp10 ( double x ) | ? | ? | | | | | | Calculate the base 10 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double exp2 ( double x ) | ✓ | ✓ | +| double exp2 ( double x ) | ? | ? | | | | | | Calculate the base 2 exponential of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double expm1 ( double x ) | ✓ | ✓ | +| double expm1 ( double x ) | ? | ? | | | | | | Calculate the base e exponential of the input argument, minus 1. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fabs ( double x ) | ✓ | ✓ | +| double fabs ( double x ) | ? | ? | | | | | | Calculate the absolute value of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fdim ( double x, double y ) | ✓ | ✓ | +| double fdim ( double x, double y ) | ? | ? | | | | | | Compute the positive difference between x and y. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double floor ( double x ) | ✓ | ✓ | +| double floor ( double x ) | ? | ? | | | | | | Calculate the largest integer less than or equal to x. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fma ( double x, double y, double z ) | ✓ | ✓ | +| double fma ( double x, double y, double z ) | ? | ? | | | | | -| Compute x × y + z as a single operation. | | | +| Compute x x y + z as a single operation. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmax ( double , double ) | ✓ | ✓ | +| double fmax ( double , double ) | ? | ? | | | | | | Determine the maximum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmin ( double x, double y ) | ✓ | ✓ | +| double fmin ( double x, double y ) | ? | ? | | | | | | Determine the minimum numeric value of the arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double fmod ( double x, double y ) | ✓ | ✓ | +| double fmod ( double x, double y ) | ? | ? | | | | | | Calculate the floating-point remainder of x / y. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double frexp ( double x, int* nptr ) | ✓ | ✗ | +| double frexp ( double x, int* nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double hypot ( double x, double y ) | ✓ | ✓ | +| double hypot ( double x, double y ) | ? | ? | | | | | | Calculate the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| int ilogb ( double x ) | ✓ | ✓ | +| int ilogb ( double x ) | ? | ? | | | | | | Compute the unbiased integer exponent of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isfinite ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isfinite ( double a ) | ? | ? | | | | | | Determine whether argument is finite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isinf ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isinf ( double a ) | ? | ? | | | | | | Determine whether argument is infinite. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 isnan ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 isnan ( double a ) | ? | ? | | | | | | Determine whether argument is a NaN. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double ldexp ( double x, int exp ) | ✓ | ✓ | +| double ldexp ( double x, int exp ) | ? | ? | | | | | -| Calculate the value of x ⋅ 2exp. | | | +| Calculate the value of x ? 2exp. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log ( double x ) | ✓ | ✓ | +| double log ( double x ) | ? | ? | | | | | | Calculate the base e logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log10 ( double x ) | ✓ | ✓ | +| double log10 ( double x ) | ? | ? | | | | | | Calculate the base 10 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log1p ( double x ) | ✓ | ✓ | +| double log1p ( double x ) | ? | ? | | | | | | Calculate the value of loge( 1 + x ). | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double log2 ( double x ) | ✓ | ✓ | +| double log2 ( double x ) | ? | ? | | | | | | Calculate the base 2 logarithm of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double logb ( double x ) | ✓ | ✓ | +| double logb ( double x ) | ? | ? | | | | | | Calculate the floating point representation of the exponent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double modf ( double x, double* iptr ) | ✓ | ✗ | +| double modf ( double x, double* iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nan ( const char* tagp ) | ✗ | ✓ | +| double nan ( const char* tagp ) | ? | ? | | | | | | Returns "Not a Number"" value." | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nearbyint ( double x ) | ✓ | ✓ | +| double nearbyint ( double x ) | ? | ? | | | | | | Round the input argument to the nearest integer. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double pow ( double x, double y ) | ✓ | ✓ | +| double pow ( double x, double y ) | ? | ? | | | | | | Calculate the value of first argument to the power of second argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remainder ( double x, double y ) | ✓ | ✓ | +| double remainder ( double x, double y ) | ? | ? | | | | | | Compute double-precision floating-point remainder. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remquo ( double x, double y, int* quo ) | ✓ | ✗ | +| double remquo ( double x, double y, int* quo ) | ? | ? | | | | | | Compute double-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double round ( double x ) | ✓ | ✓ | +| double round ( double x ) | ? | ? | | | | | | Round to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double scalbn ( double x, int n ) | ✓ | ✓ | +| double scalbn ( double x, int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| __RETURN_TYPE1 signbit ( double a ) | ✓ | ✓ | +| __RETURN_TYPE1 signbit ( double a ) | ? | ? | | | | | | Return the sign bit of the input. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sin ( double x ) | ✓ | ✓ | +| double sin ( double x ) | ? | ? | | | | | | Calculate the sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincos ( double x, double* sptr, double* cptr ) | ✓ | ✗ | +| void sincos ( double x, double* sptr, double* cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sinh ( double x ) | ✓ | ✓ | +| double sinh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic sine of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double sqrt ( double x ) | ✓ | ✓ | +| double sqrt ( double x ) | ? | ? | | | | | | Calculate the square root of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tan ( double x ) | ✓ | ✓ | +| double tan ( double x ) | ? | ? | | | | | | Calculate the tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tanh ( double x ) | ✓ | ✓ | +| double tanh ( double x ) | ? | ? | | | | | | Calculate the hyperbolic tangent of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double tgamma ( double x ) | ✓ | ✓ | +| double tgamma ( double x ) | ? | ? | | | | | | Calculate the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double trunc ( double x ) | ✓ | ✓ | +| double trunc ( double x ) | ? | ? 
| | | | | | Truncate input argument to the integral part. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfcinv ( double y ) | ✓ | ✓ | +| double erfcinv ( double y ) | ? | ? | | | | | | Calculate the inverse complementary function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfcx ( double x ) | ✓ | ✓ | +| double erfcx ( double x ) | ? | ? | | | | | | Calculate the scaled complementary error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double erfinv ( double y ) | ✓ | ✓ | +| double erfinv ( double y ) | ? | ? | | | | | | Calculate the inverse error function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double frexp ( float x, int *nptr ) | ✓ | ✓ | +| double frexp ( float x, int *nptr ) | ? | ? | | | | | | Extract mantissa and exponent of a floating-point value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double j0 ( double x ) | ✓ | ✓ | +| double j0 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double j1 ( double x ) | ✓ | ✓ | +| double j1 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double jn ( int n, double x ) | ✓ | ✓ | +| double jn ( int n, double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the first kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double lgamma ( double x ) | ✓ | ✓ | +| double lgamma ( double x ) | ? | ? | | | | | | Calculate the natural logarithm of the absolute value of the gamma function of the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llrint ( double x ) | ✓ | ✓ | +| long long int llrint ( double x ) | ? | ? | | | | | | Round input to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long long int llround ( double x ) | ✓ | ✓ | +| long long int llround ( double x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lrint ( double x ) | ✓ | ✓ | +| long int lrint ( double x ) | ? | ? | | | | | | Round input to nearest integer value. 
| | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| long int lround ( double x ) | ✓ | ✓ | +| long int lround ( double x ) | ? | ? | | | | | | Round to nearest integer value. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double modf ( double x, double *iptr ) | ✓ | ✓ | +| double modf ( double x, double *iptr ) | ? | ? | | | | | | Break down the input argument into fractional and integral parts. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double nextafter ( double x, double y ) | ✓ | ✓ | +| double nextafter ( double x, double y ) | ? | ? | | | | | | Returns next representable single-precision floating-point value after argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double norm3d ( double a, double b, double c ) | ✓ | ✓ | +| double norm3d ( double a, double b, double c ) | ? | ? | | | | | | Calculate the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| float norm4d ( double a, double b, double c, double d ) | ✓ | ✓ | +| float norm4d ( double a, double b, double c, double d ) | ? | ? | | | | | | Calculate the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double normcdf ( double y ) | ✓ | ✓ | +| double normcdf ( double y ) | ? | ? | | | | | | Calculate the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double normcdfinv ( double y ) | ✓ | ✓ | +| double normcdfinv ( double y ) | ? | ? | | | | | | Calculate the inverse of the standard normal cumulative distribution function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rcbrt ( double x ) | ✓ | ✓ | +| double rcbrt ( double x ) | ? | ? | | | | | | Calculate the reciprocal cube root function. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double remquo ( double x, double y, int *quo ) | ✓ | ✓ | +| double remquo ( double x, double y, int *quo ) | ? | ? | | | | | | Compute single-precision floating-point remainder and part of quotient. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rhypot ( double x, double y ) | ✓ | ✓ | +| double rhypot ( double x, double y ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of two arguments. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rint ( double x ) | ✓ | ✓ | +| double rint ( double x ) | ? | ? 
| | | | | | Round input to nearest integer value in floating-point. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm3d ( double a, double b, double c ) | ✓ | ✓ | +| double rnorm3d ( double a, double b, double c ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of three coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm4d ( double a, double b, double c, double d ) | ✓ | ✓ | +| double rnorm4d ( double a, double b, double c, double d ) | ? | ? | | | | | | Calculate one over the square root of the sum of squares of four coordinates of the argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double rnorm ( int dim, const double *a ) | ✓ | ✓ | +| double rnorm ( int dim, const double *a ) | ? | ? | | | | | | Calculate the reciprocal of square root of the sum of squares of any number of coordinates. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double scalbln ( double x, long int n ) | ✓ | ✓ | +| double scalbln ( double x, long int n ) | ? | ? | | | | | | Scale floating-point input by integer power of two. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincos ( double x, double *sptr, double *cptr ) | ✓ | ✓ | +| void sincos ( double x, double *sptr, double *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| void sincospi ( double x, double *sptr, double *cptr ) | ✓ | ✓ | +| void sincospi ( double x, double *sptr, double *cptr ) | ? | ? | | | | | | Calculate the sine and cosine of the first input argument multiplied by PI. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double y0f ( double x ) | ✓ | ✓ | +| double y0f ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 0 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double y1 ( double x ) | ✓ | ✓ | +| double y1 ( double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order 1 for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -| double yn ( int n, double x ) | ✓ | ✓ | +| double yn ( int n, double x ) | ? | ? | | | | | | Calculate the value of the Bessel function of the second kind of order n for the input argument. | | | +----------------------------------------------------------------------------------------------------+-------------------+---------------------+ -[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. 
+[1] __RETURN_TYPE is dependent on compiler. It is usually 'int' for C compilers and 'bool' for C++ compilers. .. _Integer-Intrinsics: @@ -1135,23 +1135,23 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi +----------------------------------------------------------------------------+ | float __frsqrt_rn ( float x ) | | | -| Compute 1/√x in round-to-nearest-even mode. | +| Compute 1/√x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rd ( float x ) | | | -| Compute √x in round-down mode. | +| Compute √x in round-down mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rn ( float x ) | | | -| Compute √x in round-to-nearest-even mode. | +| Compute √x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | float __fsqrt_ru ( float x ) | | | -| Compute √x in round-up mode. | +| Compute √x in round-up mode. | +----------------------------------------------------------------------------+ | float __fsqrt_rz ( float x ) | | | -| Compute √x in round-towards-zero mode. | +| Compute √x in round-towards-zero mode. | +----------------------------------------------------------------------------+ | float __log10f ( float x ) | | | @@ -1179,19 +1179,19 @@ Following is the list of supported floating-point intrinsics. Note that intrinsi +----------------------------------------------------------------------------+ | double __dsqrt_rd ( double x ) | | | -| Compute √x in round-down mode. | +| Compute √x in round-down mode. | +----------------------------------------------------------------------------+ | double __dsqrt_rn ( double x ) | | | -| Compute √x in round-to-nearest-even mode. | +| Compute √x in round-to-nearest-even mode. | +----------------------------------------------------------------------------+ | double __dsqrt_ru ( double x ) | | | -| Compute √x in round-up mode. | +| Compute √x in round-up mode. | +----------------------------------------------------------------------------+ | double __dsqrt_rz ( double x ) | | | -| Compute √x in round-towards-zero mode. | +| Compute √x in round-towards-zero mode. | +----------------------------------------------------------------------------+ .. _Texture-Functions: @@ -1206,7 +1206,7 @@ Texture functions are not supported. Surface Functions ------------------ Surface functions are not supported. - + .. _Timer-Functions: Timer Functions @@ -1217,7 +1217,7 @@ HIP provides the following built-in functions for reading a high-resolution time clock_t clock() long long int clock64() - + Returns the value of counter that is incremented every clock cycle on device. Difference in values returned provides the cycles used. @@ -1232,65 +1232,65 @@ HIP supports the following atomic operations. +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ | Function | Supported in HIP | Supported in CUDA | +=============================================================================================================================+==================+===================+ -| int atomicAdd(int* address, int val) | ✓ | ✓ | +| int atomicAdd(int* address, int val) | ✓ | ✓
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicAdd(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| float atomicAdd(float* address, float val) | ✓ | ✓ | +| float atomicAdd(float* address, float val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicSub(int* address, int val) | ✓ | ✓ | +| int atomicSub(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicSub(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicExch(int* address, int val) | ✓ | ✓ | +| int atomicExch(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicExch(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicExch(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| float atomicExch(float* address, float val) | ✓ | ✓ | +| float atomicExch(float* address, float val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicMin(int* address, int val) | ✓ | ✓ | +| int atomicMin(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicMin(unsigned int* address,unsigned int val) | ✓ | ✓
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicMax(int* address, int val) | ✓ | ✓ | +| int atomicMax(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicMax(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicMax(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicInc(unsigned int* address) | ✗ | ✓ | +| unsigned int atomicInc(unsigned int* address) | ✗ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicDec(unsigned int* address) | ✗ | ✓ | +| unsigned int atomicDec(unsigned int* address) | ✗ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | +| int atomicCAS(int* address, int compare, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | +| unsigned int atomicCAS(unsigned int* address,unsigned int compare,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicCAS(unsigned long long int* address,unsigned long long int compare,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicAnd(int* address, int val) | ✓ | ✓ | +| int atomicAnd(int* address, int val) | ✓ | ✓
| +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicAnd(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicOr(int* address, int val) | ✓ | ✓ | +| int atomicOr(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicOr(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +| unsigned long long int atomicOr(unsigned long long int* address,unsigned long long int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| int atomicXor(int* address, int val) | ✓ | ✓ | +| int atomicXor(int* address, int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ | +| unsigned int atomicXor(unsigned int* address,unsigned int val) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ -| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ✓ | ✓ | +| unsigned long long int atomicXor(unsigned long long int* address,unsigned long long int val)) | ✓ | ✓ | +-----------------------------------------------------------------------------------------------------------------------------+------------------+-------------------+ .. _Caveats-and-Features-Under-Development: @@ -1309,20 +1309,20 @@ Warp Cross-Lane Functions Warp cross-lane functions operate across all lanes in a warp. The hardware guarantees that all warp lanes will execute in lockstep, so additional synchronization is unnecessary, and the instructions use no shared memory. -Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn’t assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources.
In addition to the warpSize device function, host code can obtain the warpSize from the device properties:: +Note that Nvidia and AMD devices have different warp sizes, so portable code should use the warpSize built-ins to query the warp size. Hipified code from the Cuda path requires careful review to ensure it doesn't assume a waveSize of 32. "Wave-aware" code that assumes a waveSize of 32 will run on a wave-64 machine, but it will utilize only half of the machine resources. In addition to the warpSize device function, host code can obtain the warpSize from the device properties:: cudaDeviceProp props; cudaGetDeviceProperties(&props, deviceID); - int w = props.warpSize; + int w = props.warpSize; // implement portable algorithm based on w (rather than assume 32 or 64) - + .. _Warp-Vote-and-Ballot-Functions: Warp Vote and Ballot Functions ++++++++++++++++++++++++++++++++ :: - + int __all(int predicate) int __any(int predicate) uint64_t __ballot(int predicate) @@ -1334,7 +1334,7 @@ Threads in a warp are referred to as lanes and are numbered from 0 to warpSize - * __all() returns 1 if all other warp lanes contribute nonzero predicates, or 0 otherwise Applications can test whether the target platform supports the any/all instruction using the hasWarpVote device property or the HIP_ARCH_HAS_WARP_VOTE compiler define. -``__ballot`` provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's __ballot function supports a 64-bit return value (compared with Cuda’s 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the hasWarpBallot device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define. +``__ballot`` provides a bit mask containing the 1-bit predicate value from each lane. The nth bit of the result contains the 1 bit contributed by the nth warp lane. Note that HIP's __ballot function supports a 64-bit return value (compared with Cuda's 32 bits). Code ported from Cuda should support the larger warp sizes that the HIP version of this instruction supports. Applications can test whether the target platform supports the ballot instruction using the hasWarpBallot device property or the HIP_ARCH_HAS_WARP_BALLOT compiler define. .. _Warp-Shuffle-Functions: @@ -1343,14 +1343,14 @@ Warp Shuffle Functions ++++++++++++++++++++++++ Half-float shuffles are not supported. The default width is warpSize---see :ref:`Warp Cross-Lane Functions`. Applications should not assume the warpSize is 32 or 64. :: - + int __shfl (int var, int srcLane, int width=warpSize); float __shfl (float var, int srcLane, int width=warpSize); int __shfl_up (int var, unsigned int delta, int width=warpSize); float __shfl_up (float var, unsigned int delta, int width=warpSize); int __shfl_down (int var, unsigned int delta, int width=warpSize); float __shfl_down (float var, unsigned int delta, int width=warpSize) ; - int __shfl_xor (int var, int laneMask, int width=warpSize) + int __shfl_xor (int var, int laneMask, int width=warpSize) float __shfl_xor (float var, int laneMask, int width=warpSize); .. _Cooperative Groups Functions: @@ -1365,88 +1365,88 @@ HIP does not support any of the kernel language cooperative groups types or functions. 
+--------------------------------------------------------+------------------------+----------------------------+ -| Function | Supported in HIP | Supported in CUDA | +| Function | Supported in HIP | Supported in CUDA | +--------------------------------------------------------+------------------------+----------------------------+ -|void thread_group.sync() | | y | +|void thread_group.sync() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ -|unsigned thread_group.size() | | y | +|unsigned thread_group.size() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ |unsigned thread_group.thread_rank() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ -|bool thread_group.is_valid() | | y | +|bool thread_group.is_valid() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ |thread_group tiled_partiti0on(thread_group, size) | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |thread_block_tile tiled_partition(thread_group) | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |thread_block this_thread_block() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.shfl() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.shfl_down() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.shfl_up() | | y | | | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.shfl_xor() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.any() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.all() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.ballot() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.match_any() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |T thread_block_tile.match_all() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |coalesced_group coalesced_threads() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |grid_group this_grid() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |void grid_group.sync() | | y | -| | | | +| | | | 
+--------------------------------------------------------+------------------------+----------------------------+ |unsigned grid_group.size() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |unsigned grid_group.thread_rank() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |bool grid_group.is_valid() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |multi_grid_group this_multi_grid() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |void multi_grid_group.sync() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |unsigned multi_grid_group.size() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |unsigned multi_grid_group.thread_rank() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ |bool multi_grid_group.is_valid() | | y | -| | | | +| | | | +--------------------------------------------------------+------------------------+----------------------------+ @@ -1462,21 +1462,21 @@ HIP does not support any of the kernel language warp matrix types or functions. +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -| Function | Supported in HIP | Supported in CUD | +| Function | Supported in HIP | Supported in CUDA | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda) | | ✓ | +|void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda) | | ✓ | | | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda, layout_t layout) | | ✓ | +|void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned lda, layout_t layout) | | ✓ | | | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout) | | ✓ | +|void store_matrix_sync(T* mptr, fragment<...> &a, unsigned lda, layout_t layout) | | ✓ | | | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void fill_fragment(fragment<...> &a, const T &value) | | ✓ | +|void fill_fragment(fragment<...> &a, const T &value) | | ✓ | | | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ -|void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, | | ✓ | +|void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, | | ✓
| |const fragment<...> &c , bool sat) | | | +--------------------------------------------------------------------------------------+------------------------+----------------------------+ @@ -1526,14 +1526,14 @@ GPU multiprocessors have a fixed pool of resources (primarily registers and shar **hip_launch_bounds** allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code. **hip_launch_bounds** is a function attribute that must be attached to a **global** function:: __global__ void `__launch_bounds__`(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ... - MyKernel(hipGridLaunch lp, ...) + MyKernel(hipGridLaunch lp, ...) ... **launch_bounds** supports two parameters: -* MAX_THREADS_PER_BLOCK - The programmers guarantees that kernel will be launched with threads less than - MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, - MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying +* MAX_THREADS_PER_BLOCK - The programmers guarantees that kernel will be launched with threads less than + MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, + MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time. The threads-per-block is the product of (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z). * MIN_WARPS_PER_EU - directs the compiler to minimize resource usage so that the requested number of warps can be simultaneously active on a multi-processor. Since active warps compete for the same fixed pool of resources, the compiler must reduce resources required by each warp(primarily registers). MIN_WARPS_PER_EU is optional and defaults to 1 if not specified. Specifying a MIN_WARPS_PER_EU greater than the default 1 effectively constrains the compiler's resource usage. @@ -1563,13 +1563,13 @@ Porting from CUDA __launch_bounds CUDA defines a __launch_bounds which is also designed to control occupancy:: __launch_bounds(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR) - + * The second parameter __launch_bounds parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors ( This conversion is performed automatically by the clang hipify tools.) :: - + MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK)/32 - + The key differences in the interface are: @@ -1598,15 +1598,15 @@ Unroll with a bounds that is known at compile-time is supported. For example:: #pragma unroll 16 /* hint to compiler to unroll next loop by 16 */ for (int i=0; i<16; i++) ... - - + + #pragma unroll 1 /* tell compiler to never unroll the loop */ for (int i=0; i<16; i++) ... - + #pragma unroll /* hint to compiler to completely unroll next loop. */ for (int i=0; i<16; i++) ... - + .. _In-Line-Assembly: @@ -1635,12 +1635,12 @@ Kernel Compilation hipcc now supports compiling C++/HIP kernels to binary code objects. The user can specify the target for which the binary can be generated. HIP/HCC does not yet support fat binaries so only a single target may be specified. The file format for binary is ``.co`` which means Code Object. 
The following command builds the code object using **hipcc**. :: - hipcc --genco --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE] - + hipcc --genco --target-isa=[TARGET GPU] [INPUT FILE] -o [OUTPUT FILE] + [INPUT FILE] = Name of the file containing kernels - [OUTPUT FILE] = Name of the generated code object file``` + [OUTPUT FILE] = Name of the generated code object file``` Note that one important fact to remember when using binary code objects is that the number of arguments to the kernel are different on HCC and NVCC path. Refer to the sample in samples/0_Intro/module_api for differences in the arguments to be passed to the kernel. - + diff --git a/Programming_Guides/LanguageInto.rst b/Programming_Guides/LanguageInto.rst index a3020cb2..2dec61d0 100644 --- a/Programming_Guides/LanguageInto.rst +++ b/Programming_Guides/LanguageInto.rst @@ -14,7 +14,7 @@ problem at hand. Here, we describe some of the options and how to choose among t HCC: Heterogeneous Compute Compiler #################################### -What is the Heterogeneous Compute (HC) API? It’s a C++ dialect with extensions to launch kernels and manage accelerator memory. It closely tracks the evolution of C++ and will incorporate parallelism and concurrency features as the C++ standard does. For example, HC includes early support for the C++17 Parallel STL. At the recent ISO C++ meetings in Kona and Jacksonville, the committee was excited about enabling the language to express all forms of parallelism, including multicore CPU, SIMD and GPU. We’ll be following these developments closely, and you’ll see HC move quickly to include standard C++ capabilities. +What is the Heterogeneous Compute (HC) API? It's a C++ dialect with extensions to launch kernels and manage accelerator memory. It closely tracks the evolution of C++ and will incorporate parallelism and concurrency features as the C++ standard does. For example, HC includes early support for the C++17 Parallel STL. At the recent ISO C++ meetings in Kona and Jacksonville, the committee was excited about enabling the language to express all forms of parallelism, including multicore CPU, SIMD and GPU. We'll be following these developments closely, and you'll see HC move quickly to include standard C++ capabilities. The Heterogeneous Compute Compiler (HCC) provides two important benefits: @@ -29,7 +29,7 @@ The Heterogeneous Compute Compiler (HCC) provides two important benefits: **Full control over the machine** - * Access AMD scratchpad memories (“LDS”) + * Access AMD scratchpad memories ("LDS") * Fully control data movement, prefetch and discard * Fully control asynchronous kernel launch and completion * Get device-side dependency resolution for kernel and data commands (without host involvement) @@ -44,7 +44,7 @@ performance or control of the machine. HIP: Heterogeneous-Computing Interface for Portability ######################################################### -What is Heterogeneous-Computing Interface for Portability (HIP)? It’s a C++ dialect designed to ease conversion of Cuda applications to portable C++ code. It provides a C-style API and a C++ kernel language. The C++ interface can use templates and classes across the +What is Heterogeneous-Computing Interface for Portability (HIP)? It's a C++ dialect designed to ease conversion of Cuda applications to portable C++ code. It provides a C-style API and a C++ kernel language. The C++ interface can use templates and classes across the host/kernel boundary. 
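To give a feel for the dialect, here is a minimal sketch of a HIP program (the kernel, sizes and omitted error checking are illustrative only, not taken from a specific sample)::

    #include <hip/hip_runtime.h>
    #include <vector>

    __global__ void vector_add(const float* a, const float* b, float* c, int n)
    {
        int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
        if (i < n) c[i] = a[i] + b[i];
    }

    int main()
    {
        const int n = 1 << 20;
        std::vector<float> ha(n, 1.0f), hb(n, 2.0f), hc(n);
        float *da, *db, *dc;
        hipMalloc(&da, n * sizeof(float));
        hipMalloc(&db, n * sizeof(float));
        hipMalloc(&dc, n * sizeof(float));
        hipMemcpy(da, ha.data(), n * sizeof(float), hipMemcpyHostToDevice);
        hipMemcpy(db, hb.data(), n * sizeof(float), hipMemcpyHostToDevice);
        // C-style launch: grid dim, block dim, dynamic shared memory, stream, kernel arguments
        hipLaunchKernelGGL(vector_add, dim3((n + 255) / 256), dim3(256), 0, 0, da, db, dc, n);
        hipMemcpy(hc.data(), dc, n * sizeof(float), hipMemcpyDeviceToHost);
        hipFree(da); hipFree(db); hipFree(dc);
        return 0;
    }

The same source compiles with hipcc for either an AMD or an Nvidia target, which is the portability property described above.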
The Hipify tool automates much of the conversion work by performing a source-to-source transformation from Cuda to HIP. HIP code can run on AMD hardware (through the HCC compiler) or Nvidia hardware (through the NVCC compiler) with no performance loss compared with the original Cuda code. @@ -55,9 +55,9 @@ Programmers familiar with other GPGPU languages will find HIP very easy to learn Use HIP when converting Cuda applications to portable C++ and for new projects that require portability between AMD and Nvidia. HIP provides a C++ development language and access to the best development tools on both platforms. -**OpenCL™: Open Compute Language** +**OpenCL(TM): Open Compute Language** -What is OpenCL? It’s a framework for developing programs that can execute across a wide variety of heterogeneous platforms. AMD, Intel +What is OpenCL? It's a framework for developing programs that can execute across a wide variety of heterogeneous platforms. AMD, Intel and Nvidia GPUs support version 1.2 of the specification, as do x86 CPUs and other devices (including FPGAs and DSPs). OpenCL provides a C run-time API and C99-based kernel language. **When to Use OpenCL** @@ -67,7 +67,7 @@ Windows, Linux and Mac OS, as well as a wide variety of hardware platforms (desc **Anaconda Python With Numba** -What is Anaconda? It’s a modern open-source analytics platform powered by Python. Continuum Analytics, a ROCm platform partner, is the driving force behind it. Anaconda delivers high-performance capabilities including acceleration of HSA APUs, as well as +What is Anaconda? It's a modern open-source analytics platform powered by Python. Continuum Analytics, a ROCm platform partner, is the driving force behind it. Anaconda delivers high-performance capabilities including acceleration of HSA APUs, as well as ROCm-enabled discrete GPUs via Numba. It gives superpowers to the people who are changing the world. **Numba** @@ -81,7 +81,7 @@ Numba works by generating optimized machine code using the LLVM compiler infrast **When to Use Anaconda** -Use Anaconda when you’re handling large-scale data-analytics, +Use Anaconda when you're handling large-scale data-analytics, scientific and engineering problems that require you to manipulate large data arrays. diff --git a/Programming_Guides/Opencl-optimization.rst b/Programming_Guides/Opencl-optimization.rst index 1fa69e53..df03bf98 100644 --- a/Programming_Guides/Opencl-optimization.rst +++ b/Programming_Guides/Opencl-optimization.rst @@ -6,7 +6,7 @@ OPENCL Optimization ======================== -.. Note:: Re-Write in Progress to move this to Vega and FIJI/Polaris optimization guide +.. Note:: Re-Write in Progress to move this to Vega and FIJI/Polaris optimization guide Chapter 1 OpenCL Performance and Optimization ============================================== @@ -17,7 +17,7 @@ This chapter discusses performance and optimization when programming for AMD het -------------- AMD's CodeXL is an OpenCL kernel debugging and memory and performance analysis tool that gathers data from the OpenCL run-time and OpenCL devices during the execution of an OpenCL application. This information is used to discover bottlenecks in the application and find ways to optimize the application's performance for AMD platforms. -CodeXL 1.7, the latest version as of this writing, is available as an extension to Microsoft® Visual Studio®, a stand-alone version for Windows, and a stand-alone version for Linux. 
+CodeXL 1.7, the latest version as of this writing, is available as an extension to Microsoft(R) Visual Studio(R), a stand-alone version for Windows, and a stand-alone version for Linux. For a high-level summary of CodeXL features, see Chapter 4 in the AMD OpenCL User Guide. For information about how to use CodeXL to gather performance data about your OpenCL application, such as application traces and timeline views, see the `CodeXL home page `_. @@ -29,7 +29,7 @@ The Timeline View can be useful for debugging your OpenCL application. Examples For example, the timeline should show that non-dependent kernel executions and data transfer operations occurred simultaneously. -CodeXL also provides information about GPU kernel performance counters. This information can be used to find possible bottlenecks in the kernel execution. You can find the list of performance counters supported by AMD Radeon™ GPUs in the CodeXL documentation. Once the trace data has been used to discover which kernel is most in need of optimization, you can collect the GPU performance counters to drill down into the kernel execution on a GPU device. +CodeXL also provides information about GPU kernel performance counters. This information can be used to find possible bottlenecks in the kernel execution. You can find the list of performance counters supported by AMD Radeon(TM) GPUs in the CodeXL documentation. Once the trace data has been used to discover which kernel is most in need of optimization, you can collect the GPU performance counters to drill down into the kernel execution on a GPU device. The Analyze Mode in CodeXL provides the Statistics View, which can be used to gather useful statistics regarding the GPU usage of kernels. @@ -66,21 +66,21 @@ The sample code below shows how to compute the kernel execution time (End- Start The CodeXL GPU Profiler also can record the execution time for a kernel automatically. The Kernel Time metric reported in the Profiler output uses the built-in OpenCL timing capability and reports the same result as the ``kernelExecTimeNs`` calculation shown above. -Another interesting metric to track is the kernel launch time (Start - Queue). The kernel launch time includes both the time spent in the user application (after enqueuing the command, but before it is submitted to the device), as well as the time spent in the runtime to launch the kernel. For CPU devices, the kernel launch time is fast (tens of 1's), but for discrete GPU devices it can be several hundred μs. Enabling profiling on a command queue adds approximately 10 μs to 40 μs overhead to all clEnqueue calls. Much of the profiling overhead affects the start time; thus, it is visible in the launch time. Be careful when interpreting this metric. To reduce the launch overhead, the AMD OpenCL runtime combines several command submissions into a batch. Commands submitted as batch report similar start times and the same end time. +Another interesting metric to track is the kernel launch time (Start - Queue). The kernel launch time includes both the time spent in the user application (after enqueuing the command, but before it is submitted to the device), as well as the time spent in the runtime to launch the kernel. For CPU devices, the kernel launch time is fast (tens of μs), but for discrete GPU devices it can be several hundred μs. Enabling profiling on a command queue adds approximately 10 μs to 40 μs overhead to all clEnqueue calls.
Much of the profiling overhead affects the start time; thus, it is visible in the launch time. Be careful when interpreting this metric. To reduce the launch overhead, the AMD OpenCL runtime combines several command submissions into a batch. Commands submitted as batch report similar start times and the same end time. Measure performance of your test with CPU counters. Do not use OCL profiling. To determine if an application is executed asynchonically, build a dependent execution with OCL events. This is a "generic" solution; however, there is an exception when you can enable profiling and have overlap transfers. DRMDMA engines do not support timestamps ("GPU counters"). To get OCL profiling data, the runtime must synchronize the main command processor (CP) with the DMA engine; this disables overlap. Note, however, that Southern Islands has two independent main CPs and runtime pairs them with DMA engines. So, the application can still execute kernels on one CP, while another is synced with a DRM engine for profiling; this lets you profile it with APP or OCL profiling. 1.2.2 Using the OpenCL timer with Other System Timers ++++++++++++++++++++++++++++++++++++++++++++++++++++++ The resolution of the timer, given in ns, can be obtained from:: - + clGetDeviceInfo(...,CL_DEVICE_PROFILING_TIMER_RESOLUTION...); AMD CPUs and GPUs report a timer resolution of 1 ns. AMD OpenCL devices are required to correctly track time across changes in frequency and power states. Also, the AMD APP SDK uses the same time-domain for all devices in the platform; thus, the profiling timestamps can be directly compared across the CPU and GPU devices. The sample code below can be used to read the current value of the OpenCL timer clock. The clock is the same routine used by the AMD OpenCL runtime to generate the profiling timestamps. This function is useful for correlating other program events with the OpenCL profiling timestamps. :: - + uint64_t timeNanos() { #ifdef linux @@ -101,7 +101,7 @@ For more information, see section 5.9, "Profiling Operations on Memory Objects a 1.2.3 Estimating Memory Bandwidth ++++++++++++++++++++++++++++++++++ The memory bandwidth required by a kernel is perhaps the most important performance consideration. To calculate this: - + Effective Bandwidth = (Br + Bw)/T where: @@ -119,7 +119,7 @@ Bw = 1 x (1024 x 1024 x 4 bytes) = 4194304 bytes ;; 1 array, 1024x1024, each ele If the elapsed time for this copy as reported by the profiling timers is 1000000 ns (1 million ns, or .001 sec), the effective bandwidth is: (Br+Bw)/T = (8388608+4194304)/1000000 = 12.6GB/s - + The CodeXL GPU Profiler can report the number of dynamic instructions per thread that access global memory through the FetchInsts and WriteInsts counters. The Fetch and Write reports average the per-thread counts; these can be fractions if the threads diverge. The Profiler also reports the dimensions of the global NDRange for the kernel in the GlobalWorkSize field. The total number of threads can be determined by multiplying together the three components of the range. If all (or most) global accesses are the same size, the counts from the Profiler and the approximate size can be used to estimate Br and Bw: Br = Fetch * GlobalWorkitems * Size @@ -156,7 +156,7 @@ OpenCL uses memory objects to pass data to kernels. 
These can be either buffers * how to control which memory kind is used for a memory object; * how the runtime maps memory objects for host access; - + * how the runtime performs memory object reading, writing and copying; * how best to use command queues; and @@ -167,13 +167,13 @@ OpenCL uses memory objects to pass data to kernels. These can be either buffers +++++++++++++++++++++++++++++++++++++++++ Memory is used to store memory objects that are accessed by kernels executing on the device, as well as to hold memory object data when they are mapped for access by the host application. This section describes the different memory kinds used by the runtime. Table 1.1 lists the performance of each memory type given -a PCIe3-capable platform and a high-end AMD Radeon™ 7XXX discrete GPU. In Table 1.1, when host memory is accessed by the GPU shader, it is of type ``CL_MEM_ALLOC_HOST_PTR``. When GPU memory is accessed by the CPU, it is of type ``CL_MEM_PERSISTENT_MEM_AMD``. +a PCIe3-capable platform and a high-end AMD Radeon(TM) 7XXX discrete GPU. In Table 1.1, when host memory is accessed by the GPU shader, it is of type ``CL_MEM_ALLOC_HOST_PTR``. When GPU memory is accessed by the CPU, it is of type ``CL_MEM_PERSISTENT_MEM_AMD``. **Table 1.1 Memory Bandwidth in GB/s (R = read, W = write) in GB/s** **Table 2:** - + +-------------+---------+---------+--------------+--------------+-----------+-------------+ | | CPU R | GPU W | GPU Shader R | GPU Shader W | GPU DMA R | GPU DMA W | +=============+=========+=========+==============+==============+===========+=============+ @@ -204,7 +204,7 @@ If the runtime knows the data is in pinned host memory, it can be transferred to Currently, the runtime recognizes only data that is in pinned host memory for operation arguments that are memory objects it has allocated in pinned host memory. For example, the buffer argument of ``clEnqueueReadBuffer/clEnqueueWriteBuffer`` and ``image`` argument of ``clEnqueueReadImage/clEnqueueWriteImage.`` It does not detect that the ptr arguments of these operations addresses pinned host memory, even if they are the result of ``clEnqueueMapBuffer/clEnqueueMapImage`` on a memory object that is in pinned host memory. -The runtime can make pinned host memory directly accessible from the GPU. Like regular host memory, the CPU uses caching when accessing pinned host memory. For discrete devices, the GPU access to this memory is through the PCIe bus, which also limits bandwidth. For APU devices that do not have the PCIe overhead, GPU access is significantly slower than accessing device-visible host memory (see section 1.3.1.3), which does not use the cache coherency protocol. +The runtime can make pinned host memory directly accessible from the GPU. Like regular host memory, the CPU uses caching when accessing pinned host memory. For discrete devices, the GPU access to this memory is through the PCIe bus, which also limits bandwidth. For APU devices that do not have the PCIe overhead, GPU access is significantly slower than accessing device-visible host memory (see section 1.3.1.3), which does not use the cache coherency protocol. 1.3.1.3 Device-Visible Host Memory ################################### @@ -307,7 +307,7 @@ The host application can use ``clEnqueueMapBuffer/clEnqueueMapImage`` to obtain 1.3.4.1 Zero Copy Memory Objects ################################# ``CL_MEM_USE_PERSISTENT_MEM_AMD``, ``CL_MEM_USE_HOST_PTR,`` and ``CL_MEM_ALLOC_HOST_PTR`` support zero copy memory objects. 
The first provides device-resident zero copy memory objects, the other two provide host-resident zero copy memory objects. - + Zero copy memory objects can be used by an application to optimize data movement. When ``clEnqueueMapBuffer / clEnqueueMapImage / clEnqueueUnmapMemObject`` are used, no runtime transfers are performed, and the operations are very fast; however, the runtime can return a different pointer value each time a zero copy memory object is mapped. Note that only images created with ``CL_MEM_USE_PERSISTENT_MEM_AMD`` can be zero copy. From Southern Island on, devices support zero copy memory objects under Linux; however, only images created with ``CL_MEM_USE_PERSISTENT_MEM_AMD`` can be zero copy. @@ -363,11 +363,11 @@ For Southern Islands and later, devices support at least two hardware compute qu An OpenCL queue is assigned to a hardware queue on creation time. The hardware compute queues are selected according to the creation order within an OpenCL context. If the hardware supports K concurrent hardware queues, the Nth created OpenCL queue within a specific OpenCL context will be assigned to the (N mod K) hardware queue. The number of compute queues can be limited by specifying the ``GPU_NUM_COMPUTE_RINGS`` environment variable. -Devices in the Sea Islands and Volcanic Islands families contain between four and eight ACEs, and are multi-threaded (thereby supporting more hardware queues), so they offer more performance. For example, the AMD Radeon™ R9290X devices, in the VI family contain 8 ACEs and 44 CUs. - +Devices in the Sea Islands and Volcanic Islands families contain between four and eight ACEs, and are multi-threaded (thereby supporting more hardware queues), so they offer more performance. For example, the AMD Radeon(TM) R9290X devices, in the VI family contain 8 ACEs and 44 CUs. + 1.3.6.1 A note on hardware queues ################################# -A hardware queue can be thought of as a GPU entry point. The GPU can process kernels from several compute queues concurrently. All hardware queues ultimately share the same compute cores. The use of multiple hardware queues is beneficial when launching small kernels that do not fully saturate the GPU. For example, the AMD Radeon™ HD 290X compute device can execute up to 112,640 threads concurrently. The GPU can execute two kernels each spawning 56320 threads (assuming fully occupancy) twice as fast if launched concurrently through two hardware queues than serially through a single hardware queue. +A hardware queue can be thought of as a GPU entry point. The GPU can process kernels from several compute queues concurrently. All hardware queues ultimately share the same compute cores. The use of multiple hardware queues is beneficial when launching small kernels that do not fully saturate the GPU. For example, the AMD Radeon(TM) HD 290X compute device can execute up to 112,640 threads concurrently. The GPU can execute two kernels each spawning 56320 threads (assuming fully occupancy) twice as fast if launched concurrently through two hardware queues than serially through a single hardware queue. 1.4 OpenCL Data Transfer Optimization -------------------------------------- @@ -377,7 +377,7 @@ The AMD OpenCL implementation offers several optimized paths for data transfer t ++++++++++++++++++ * *Deferred allocation* - The CL runtime attempts to minimize resource consumption by delaying buffer allocation until first use. As a side effect, the first accesses to a buffer may be more expensive than subsequent accesses. 
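As a rough illustration of the (N mod K) mapping described above, a host program can create several command queues in the same context and spread independent kernel launches across them, so that small kernels may overlap on the device. The context, device and kernel names below are placeholders and error checking is omitted::

    #include <CL/cl.h>

    // Assumes ctx, dev, kernelA and kernelB already exist and have their arguments set.
    void launch_on_two_hardware_queues(cl_context ctx, cl_device_id dev,
                                       cl_kernel kernelA, cl_kernel kernelB,
                                       size_t globalSize)
    {
        cl_int err;
        // Queues created back to back in the same context are assigned to
        // successive hardware queues (N mod K), so the two launches below can overlap.
        cl_command_queue q0 = clCreateCommandQueue(ctx, dev, 0, &err);
        cl_command_queue q1 = clCreateCommandQueue(ctx, dev, 0, &err);

        clEnqueueNDRangeKernel(q0, kernelA, 1, NULL, &globalSize, NULL, 0, NULL, NULL);
        clEnqueueNDRangeKernel(q1, kernelB, 1, NULL, &globalSize, NULL, 0, NULL, NULL);

        clFinish(q0);
        clFinish(q1);
        clReleaseCommandQueue(q0);
        clReleaseCommandQueue(q1);
    }
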
- * *Peak interconnect bandwidth* - As used in the text below, this is the transfer bandwidth between host and device that is available under optimal conditions at the application level. It is dependent on the type of interconnect, the chipset, and the graphics chip. As an example, a high-performance PC with a PCIe 3.0 16x bus and a GCN architecture (AMD Radeon™ HD 7XXX series) graphics card has a nominal interconnect bandwidth of 16 GB/s. + * *Peak interconnect bandwidth* - As used in the text below, this is the transfer bandwidth between host and device that is available under optimal conditions at the application level. It is dependent on the type of interconnect, the chipset, and the graphics chip. As an example, a high-performance PC with a PCIe 3.0 16x bus and a GCN architecture (AMD Radeon(TM) HD 7XXX series) graphics card has a nominal interconnect bandwidth of 16 GB/s. * *Pinning* - When a range of host memory is prepared for transfer to the GPU, its pages are locked into system memory. This operation is called pinning; it can impose a high cost, proportional to the size of the memory range. One of the goals of optimizing data transfer is to use pre-pinned buffers whenever possible. However, if pre-pinned buffers are used excessively, it can reduce the available system memory and result in excessive swapping. Host side zero copy buffers provide easy access to pre- pinned memory. * *WC* - Write Combine is a feature of the CPU write path to a select region of the address space. Multiple adjacent writes are combined into cache lines (for example, 64 bytes) before being sent to the external bus. This path typically provides fast streamed writes, but slower scattered writes. Depending on the chip set, scattered writes across a graphics interconnect can be very slow. Also, some platforms require multi-core CPU writes to saturate the WC path over an interconnect. * *Uncached accesses* - Host memory and I/O regions can be configured as uncached. CPU read accesses are typically very slow; for example: uncached CPU reads of graphics memory over an interconnect. @@ -404,16 +404,16 @@ If a given platform supports the zero copy feature, the following buffer types a * The CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR buffers are: * zero copy buffers that resides on the host. - + * directly accessible by the host at host memory bandwidth. * directly accessible by the device across the interconnect. * a pre-pinned sources or destinations for CL read, write, and copy commands into device memory at peak interconnect bandwidth. - + Note that buffers created with the flag CL_MEM_ALLOC_HOST_PTR together with CL_MEM_READ_ONLY may reside in uncached write-combined memory. As a result, CPU can have high streamed write bandwidth, but low read and potentially low write scatter bandwidth, due to the uncached WC path. - + * The CL_MEM_USE_PERSISTENT_MEM_AMD buffer is * a zero copy buffer that resides on the GPU device. @@ -439,7 +439,7 @@ Zero copy buffers work well on APU devices. SDK 2.5 introduced an optimization t As this memory is not cacheable, CPU read operations are very slow. This type of buffer also exists on discrete platforms, but transfer performance typically is limited by PCIe bandwidth. Zero copy buffers can provide low latency for small transfers, depending on the transfer path. For small buffers, the combined latency of map/CPU memory access/unmap can be smaller than the corresponding DMA latency. 
- + 1.4.2.3 Pre-pinned Buffers ############################ @@ -467,12 +467,12 @@ From an application point of view, two fundamental use cases exist, and they can Note that the OpenCL runtime uses deferred allocation to maximize memory resources. This means that a complete roundtrip chain, including data transfer and kernel compute, might take one or two iterations to reach peak performance. A code sample named BufferBandwidth can be used to investigate and benchmark the various transfer options in combination with different buffer types. - + **Option 1** - clEnqueueWriteBuffer() and clEnqueueReadBuffer(). This option is the easiest to use on the application side. *CL_MEM_USE_HOST_PTR* is an ideal choice if the application wants to transfer a buffer that has already been allocated through ``malloc( )`` or ``mmap( )``. - There are two ways to use this option. The first uses clEnqueueRead/WriteBuffer on a pre-pinned, mapped host-side buffer: - + There are two ways to use this option. The first uses clEnqueueRead/WriteBuffer on a pre-pinned, mapped host-side buffer: + a. pinnedBuffer = clCreateBuffer ( CL_MEM_ALLOC_HOST_PTR or CL_MEM_USE_HOST_PTR ) b. deviceBuffer = clCreateBuffer( ) c. void *pinnedMemory = clEnqueueMapBuffer (pinnedBuffer) @@ -486,7 +486,7 @@ A code sample named BufferBandwidth can be used to investigate and benchmark the **Option 2** - clEnqueueCopyBuffer() on a pre-pinned host buffer (requires pre-pinned buffer support) This is analogous to Option 1. Performing a CL copy of a pre-pinned buffer to a device buffer (or vice versa) runs at peak interconnect bandwidth. - + a. pinnedBuffer = clCreateBuffer( CL_MEM_ALLOC_HOST_PTR or CL_MEM_USE_HOST_PTR ) b. deviceBuffer = clCreateBuffer() *This is followed either by :* c. void *memory = clEnqueueMapBuffer ( pinnedBuffer ) @@ -509,45 +509,45 @@ A code sample named BufferBandwidth can be used to investigate and benchmark the The transfer sequence is as follows: a. Data transfer from host to device buffer. - + 1. ptr = clEnqueueMapBuffer( .., buf, .., CL_MAP_WRITE, ..) Since the buffer is mapped write-only, no data is transferred from device buffer to host. The map operation is very low cost. A pointer to a pinned host buffer is returned. 2. The application fills in the host buffer through memset( ptr ), memcpy ( ptr, srcptr ), fread( ptr ), or direct CPU writes. This happens at host memory bandwidth. 3. clEnqueueUnmapMemObject( .., buf, ptr, .. ) The pre-pinned buffer is transferred to the GPU device, at peak interconnect bandwidth. - + b. Data transfer from device buffer to host. - - 1. ptr = clEnqueueMapBuffer(.., buf, .., CL_MAP_READ, .. ) + + 1. ptr = clEnqueueMapBuffer(.., buf, .., CL_MAP_READ, .. ) This command triggers a transfer from the device to host memory, into a pre-pinned temporary buffer, at peak interconnect bandwidth. A pointer to the pinned memory is returned. 2. The application reads and processes the data, or executes a memcpy( dstptr, ptr ), fwrite (ptr), or similar function. Since the buffer resides in host memory, this happens at host memory bandwidth. 3. clEnqueueUnmapMemObject( .., buf, ptr, .. ) - + Since the buffer was mapped as read-only, no transfer takes place, and the unmap operation is very low cost. **Option 4** - Direct host access to a zero copy device buffer (requires zero copy support) This option allows overlapping of data transfers and GPU compute. It is also useful for sparse write updates under certain constraints. - + a. 
A zero copy buffer on the device is created using the following command: buf = clCreateBuffer ( .., CL_MEM_USE_PERSISTENT_MEM_AMD, ..) This buffer can be directly accessed by the host CPU, using the uncached WC path. This can take place at the same time the GPU executes a compute kernel. A common double buffering scheme has the kernel process data from one buffer while the CPU fills a second buffer. See the TransferOverlap code sample. A zero copy device buffer can also be used to for sparse updates, such as assembling sub-rows of a larger matrix into a smaller, contiguous block for GPU processing. Due to the WC path, it is a good design choice to try to align writes to the cache line size, and to pick the write block size as large as possible. b. Transfer from the host to the device. - + 1.ptr = clEnqueueMapBuffer( .., buf, .., CL_MAP_WRITE, .. ) - + This operation is low cost because the zero copy device buffer is directly mapped into the host address space. 2.The application transfers data via memset( ptr ), memcpy( ptr, srcptr ), or direct CPU writes. The CPU writes directly across the interconnect into the zero copy device buffer. Depending on the chipset, the bandwidth can be of the same order of magnitude as the interconnect bandwidth, although it typically is lower than peak. 3.clEnqueueUnmapMemObject ( .., buf, ptr, .. ) - + As with the preceding map, this operation is low cost because the buffer continues to reside on the device. - + c. If the buffer content must be read back later, use clEnqueueReadBuffer( .., buf, ..) or clEnqueueCopyBuffer( .., buf, zero copy host buffer, .. ) - - This bypasses slow host reads through the uncached path. + + This bypasses slow host reads through the uncached path. **Option 5** - Direct GPU access to a zero copy host buffer (requires zero copy support) @@ -557,14 +557,14 @@ A code sample named BufferBandwidth can be used to investigate and benchmark the buf = clCreateBuffer( .., CL_MEM_ALLOC_HOST_PTR, .. ) b. Next the application modifies or reads the zero copy host buffer. - + 1. ptr = clEnqueueMapBuffer( .., buf, .., CL_MAP_READ | CL_MAP_WRITE, .. ) This operation is very low cost because it is a map of a buffer already residing in host memory. 2. The application modifies the data through ``memset( ptr )``, ``memcpy`` (in either direction), sparse or dense CPU reads or writes. Since the application is modifying a host buffer, these operations take place at host memory bandwidth. 3. clEnqueueUnmapMemObject( .., buf, ptr, .. ) - + As with the preceding map, this operation is very low cost because the buffer continues to reside in host memory. - + c. The application runs clEnqueueNDRangeKernel(), using buffers of this type as input or output. GPU kernel reads and writes go across the interconnect to host memory, and the data transfer becomes part of the kernel execution. The achievable bandwidth depends on the platform and chipset, but can be of the same order of magnitude as the peak interconnect bandwidth. For discrete graphics cards, it is important to note that resulting GPU kernel bandwidth is an order of magnitude lower compared to a kernel accessing a regular device buffer located on the device. @@ -576,17 +576,17 @@ The AMD OpenCL runtime supports both CPU and GPU devices. 
This section introduce 1.5.1 CPU and GPU Devices +++++++++++++++++++++++++++++ -Table 1.1 lists some key performance characteristics of two exemplary CPU and GPU devices: a quad-core AMD Phenom II X4 processor running at 2.8 GHz, and a mid-range AMD Radeon™ HD 7770 GPU running at 1 GHz. The "best" device in each characteristic is highlighted, and the ratio of the best/other device is shown in the final column. +Table 1.1 lists some key performance characteristics of two exemplary CPU and GPU devices: a quad-core AMD Phenom II X4 processor running at 2.8 GHz, and a mid-range AMD Radeon(TM) HD 7770 GPU running at 1 GHz. The "best" device in each characteristic is highlighted, and the ratio of the best/other device is shown in the final column. The GPU excels at high-throughput: the peak execution rate (measured in FLOPS) is 7X higher than the CPU, and the memory bandwidth is 2.5X higher than the CPU. The GPU also consumes approximately 65% the power of the CPU; thus, for this comparison, the power efficiency in flops/watt is 10X higher. While power efficiency can vary significantly with different devices, GPUs generally provide greater power efficiency (flops/watt) than CPUs because they optimize for throughput and eliminate hardware designed to hide latency. - + **Table 1.1 CPU and GPU Performance Characteristics** +------------------------------------+-------------------+---------------------+--------------+ | | CPU | GPU | Winner Ratio | +====================================+===================+=====================+==============+ -| Example Device | AMD Phenom™ II X4 | AMD Radeon™ HD 7770 | | +| Example Device | AMD Phenom(TM) II X4 | AMD Radeon(TM) HD 7770 | | +------------------------------------+-------------------+---------------------+--------------+ | Core Frequency | 2800 MHz | 1 GHz | 3 X | +------------------------------------+-------------------+---------------------+--------------+ @@ -618,11 +618,11 @@ The GPU excels at high-throughput: the peak execution rate (measured in FLOPS) i +------------------------------------+-------------------+---------------------+--------------+ | | | | | +------------------------------------+-------------------+---------------------+--------------+ -| Approx Kernel Launch Latency | 25 μs | 50 μs | 2 X | +| Approx Kernel Launch Latency | 25 us | 50 us | 2 X | +------------------------------------+-------------------+---------------------+--------------+ -.. [1] For the power specifications of the AMD Phenom™ II x4, see http://www.amd.com/us/products/desktop/processors/phenom-ii/Pages/phenom-ii-model-number-comparison.aspx . +.. [1] For the power specifications of the AMD Phenom(TM) II x4, see http://www.amd.com/us/products/desktop/processors/phenom-ii/Pages/phenom-ii-model-number-comparison.aspx . Table 4.5 provides a comparison of the CPU and GPU performance charac- teristics in an AMD A8-4555M "Trinity" APU (19 W, 21 GB/s memory bandwidth). @@ -655,7 +655,7 @@ Table 4.5 provides a comparison of the CPU and GPU performance charac- teristics -Conversely, CPUs excel at latency-sensitive tasks. For example, an integer add is 10X faster on the CPU than on the GPU. This is a product of both the CPUs higher clock rate (2800 MHz vs 1000 MHz for this comparison), as well as the operation latency; the CPU is optimized to perform an integer add in just one cycle, while the GPU requires four cycles. The CPU also has a latency-optimized path to DRAM, while the GPU optimizes for bandwidth and relies on many in- flight threads to hide the latency. 
The AMD Radeon™ HD 7770 GPU, for example, supports more than 25,000 in-flight work-items and can switch to a new wavefront (containing up to 64 work-items) in a single cycle. The CPU supports only four hardware threads, and thread-switching requires saving and restoring the CPU registers from memory. The GPU requires many active threads to both keep the execution resources busy, as well as provide enough threads to hide the long latency of cache misses. +Conversely, CPUs excel at latency-sensitive tasks. For example, an integer add is 10X faster on the CPU than on the GPU. This is a product of both the CPUs higher clock rate (2800 MHz vs 1000 MHz for this comparison), as well as the operation latency; the CPU is optimized to perform an integer add in just one cycle, while the GPU requires four cycles. The CPU also has a latency-optimized path to DRAM, while the GPU optimizes for bandwidth and relies on many in- flight threads to hide the latency. The AMD Radeon(TM) HD 7770 GPU, for example, supports more than 25,000 in-flight work-items and can switch to a new wavefront (containing up to 64 work-items) in a single cycle. The CPU supports only four hardware threads, and thread-switching requires saving and restoring the CPU registers from memory. The GPU requires many active threads to both keep the execution resources busy, as well as provide enough threads to hide the long latency of cache misses. Each GPU wavefront has its own register state, which enables the fast single- cycle switching between threads. Also, GPUs can be very efficient at gather/scatter operations: each work-item can load from any arbitrary address, and the registers are completely decoupled from the other threads. This is substantially more flexible and higher-performing than a classic Vector ALU-style architecture (such as SSE on the CPU), which typically requires that data be accessed from contiguous and aligned memory locations. SSE supports instructions that write parts of a register (for example, MOVLPS and MOVHPS, which write the upper and lower halves, respectively, of an SSE register), but these instructions generate additional microarchitecture dependencies and frequently require additional pack instructions to format the data correctly. @@ -682,7 +682,7 @@ Usually, when the data size is small, it is faster to use the CPU because the st By design, each OpenCL command queue can only schedule work on a single OpenCL device. Thus, using multiple devices requires the developer to create a separate queue for each device, then partition the work between the available command queues. A simple scheme for partitioning work between devices would be to statically determine the relative performance of each device, partition the work so that faster devices received more work, launch all the kernels, and then wait for them to complete. In practice, however, this rarely yields optimal performance. The relative performance of devices can be difficult to determine, in particular for kernels whose performance depends on the data input. Further, the device performance can be affected by dynamic frequency scaling, OS thread scheduling decisions, or contention for shared resources, such as shared caches and DRAM bandwidth. Simple static partitioning algorithms which "guess wrong" at the beginning can result in significantly lower performance, since some devices finish and become idle while the whole system waits for the single, unexpectedly slow device. - + For these reasons, a dynamic scheduling algorithm is recommended. 
In this approach, the workload is partitioned into smaller parts that are periodically scheduled onto the hardware. As each device completes a part of the workload, it requests a new part to execute from the pool of remaining work. Faster devices, or devices which work on easier parts of the workload, request new input faster, resulting in a natural workload balancing across the system. The approach creates some additional scheduling and kernel submission overhead, but dynamic scheduling generally helps avoid the performance cliff from a single bad initial scheduling decision, as well as higher performance in real-world system environments (since it can adapt to system conditions as the algorithm runs). Multi-core runtimes, such as Cilk, have already introduced dynamic scheduling algorithms for multi-core CPUs, and it is natural to consider extending these scheduling algorithms to GPUs as well as CPUs. A GPU introduces several new aspects to the scheduling process: @@ -723,55 +723,55 @@ The AMD OpenCL implementation spawns a new thread to manage each command queue. For low-latency CPU response, it can be more efficient to use a dedicated spin loop and not call clFinish() Calling clFinish() indicates that the application wants to wait for the GPU, putting the thread to sleep. For low latency, the application should use ``clFlush()``, followed by a loop to wait for the event to complete. This is also true for blocking maps. The application should use non- blocking maps followed by a loop waiting on the event. The following provides sample code for this. :: - + if (sleep) - + { // this puts host thread to sleep, useful if power is a consideration or overhead is not a concern ``clFinish`` (cmd_queue_); - + } else - + { - + // this keeps the host thread awake, useful if latency is a concern - + clFlush(cmd_queue_); - + error_ = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); - + while (eventStatus > 0) - + { - + error_ = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); - + to find - + Sleep(0); // be nice to other threads, allow scheduler other work if possible - + // Choose your favorite way to yield, SwitchToThread() for example, in place of Sleep(0) - + } - + } - + 1.5.5 GPU and CPU Kernels +++++++++++++++++++++++++++++ While OpenCL provides functional portability so that the same kernel can run on any device, peak performance for each device is typically obtained by tuning the OpenCL kernel for the target device. -Code optimized for the Tahiti device (the AMD Radeon™ HD 7970 GPU) typically runs well across other members of the Southern Islands family. +Code optimized for the Tahiti device (the AMD Radeon(TM) HD 7970 GPU) typically runs well across other members of the Southern Islands family. CPUs and GPUs have very different performance characteristics, and some of these impact how one writes an optimal kernel. Notable differences include: @@ -788,7 +788,7 @@ Another approach is to leverage a CPU-targeted routine written in a standard hig ++++++++++++++++++++++++++++ The AMD OpenCL program creates at least one context, and each context can contain multiple devices. Thus, developers must choose whether to place all devices in the same context or create a new context for each device. 
Generally, it is easier to extend a context to support additional devices rather than duplicating the context for each device: buffers are allocated at the context level (and automatically across all devices), programs are associated with the context, and kernel compilation (via ``clBuildProgram``) can easily be done for all devices in a context. However, with current OpenCL implementations, creating a separate context for each device provides more flexibility, especially in that buffer allocations can be targeted to occur on specific devices. Generally, placing the devices in the same context is the preferred solution. - + Chapter 2 OpenCL Performance and Optimiza- tion for GCN Devices @@ -802,9 +802,9 @@ The GPU consists of multiple compute units. Each compute unit (CU) contains loca Each compute unit contains 64 kB local memory, 16 kB of read/write L1 cache, four vector units, and one scalar unit. The maximum local memory allocation is 32 kB per work-group. Each vector unit contains 512 scalar registers (SGPRs) for handling branching, constants, and other data constant across a wavefront. Vector units also contain 256 vector registers (VGPRs). VGPRs actually are scalar registers, but they are replicated across the whole wavefront. Vector units contain 16 processing elements (PEs). Each PE is scalar. -Since the L1 cache is 16 kB per compute unit, the total L1 cache size is 16 kB * (# of compute units). For the AMD Radeon™ HD 7970, this means a total of 512 kB L1 cache. L1 bandwidth can be computed as: +Since the L1 cache is 16 kB per compute unit, the total L1 cache size is 16 kB * (# of compute units). For the AMD Radeon(TM) HD 7970, this means a total of 512 kB L1 cache. L1 bandwidth can be computed as: L1 peak bandwidth = Compute Units * (4 threads/clock) * (128 bits per thread) * (1 byte / 8 bits) * Engine Clock -For the AMD Radeon™ HD 7970, this is ~1.9 TB/s. +For the AMD Radeon(TM) HD 7970, this is ~1.9 TB/s. If two memory access requests are directed to the same controller, the hardware serializes the access. This is called a channel conflict. Similarly, if two memory access requests go to the same memory bank, hardware serializes the access. This is called a bank conflict. From a developer's point of view, there is not much difference between channel and bank conflicts. Often, a large power of two stride results in a channel conflict. The size of the power of two stride that causes a specific type of conflict depends on the chip. A stride that results in a channel conflict on a machine with eight channels might result in a bank conflict on a machine with four. @@ -827,7 +827,7 @@ When the application has complete control of the access pattern and address gene In this example: :: - for (ptr=base; ptr> B) & C ==> [u]bit_extract - + where - + | B and C are compile time constants, | A is a 8/16/32bit integer type, and | C is a mask. * Bitfield insert on signed/unsigned integers | ((A & B) << C) | ((D & E) << F ==> ubit_insert - + where - + | B and E have no conflicting bits (B^E == 0), | B, C, E, and F are compile-time constants, and | B and E are masks. @@ -1472,7 +1472,7 @@ Examples for using this loop follow. :: No unrolling example:: - #pragma unroll 1 + #pragma unroll 1 for (int i = 0; i < n; i++) { ... 
} @@ -1543,18 +1543,18 @@ In the second block of code, the ``?:`` operator executes in the vector units, s a[idx] = d[idx]; } -This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what ‘d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. +This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what aEUR~d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. *If the algorithm allows changing the work-group size, it is possible to get better performance by using larger work-groups (more work-items in each work-group) because the workgroup creation overhead is reduced. On the other hand, the OpenCL CPU runtime uses a task-stealing algorithm at the work-group level, so when the kernel execution time differs because it contains conditions and/or loops of varying number of iterations, it might be better to increase the number of work-groups. This gives the runtime more flexibility in scheduling work-groups to idle CPU cores. Experimentation might be needed to reach optimal work-group size. *Since the AMD OpenCL runtime supports only in-order queuing, using clFinish() on a queue and queuing a blocking command gives the same result. The latter saves the overhead of another API command. For example:: - + clEnqueueWriteBuffer(myCQ, buff, **CL_FALSE**, 0, buffSize, input, 0, NULL, NULL); clFinish(myCQ); is equivalent, for the AMD OpenCL runtime, to:: - + clEnqueueWriteBuffer(myCQ, buff, **CL_TRUE**, 0, buffSize, input, 0, NULL, NULL); * GPU ISA: GCN-based GPUs have 32KB of dedicated L1 instruction cache. A single instruction cache instance serves up to 4 CUs (depending upon the architecture family and device), with each CU holding up to 40 wavefronts. As each wavefront includes its own program counter, a single instruction cache unit may serve up to 160 wavefronts with each executing a different instruction in the program. @@ -1567,8 +1567,8 @@ is equivalent, for the AMD OpenCL runtime, to:: * Porting from CUDA to OpenCL is relatively straightforward. Multiple vendors have documents describing how to do this, including AMD:http://developer.amd.com/documentation/articles/pages/OpenCL-and-the-ATI-Stream-v2.0-Beta.aspx#four * Some specific performance recommendations which differ from other GPU architectures: - * Use a workgroup size that is a multiple of 64. CUDA code can use a workgroup size of 32; this uses only half the available compute resources on an AMD Radeon™ HD 7970 GPU. - * AMD GPUs have a very high single-precision flops capability (3.788 teraflops in a single AMD Radeon™ HD 7970 GPU). Algorithms that benefit from such throughput can deliver excellent performance on AMD hardware. + * Use a workgroup size that is a multiple of 64. CUDA code can use a workgroup size of 32; this uses only half the available compute resources on an AMD Radeon(TM) HD 7970 GPU. + * AMD GPUs have a very high single-precision flops capability (3.788 teraflops in a single AMD Radeon(TM) HD 7970 GPU). Algorithms that benefit from such throughput can deliver excellent performance on AMD hardware. 
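
As a concrete illustration of the work-group sizing recommendation above, the following minimal sketch (buffer setup omitted; the kernel name and dispatch size are hypothetical) requests a local size of 64 rather than 32 when enqueuing a one-dimensional kernel: ::

    size_t globalSize = 4096;  /* assumed to be a multiple of the local size */
    size_t localSize  = 64;    /* one full wavefront per work-group; 32 would leave half of each GCN SIMD idle */
    cl_int err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                        &globalSize, &localSize,
                                        0, NULL, NULL);
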
2.8.5 Guidance for CPU Programmers Using OpenCL to Program GPUs +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1617,7 +1617,7 @@ can be written as a composition of mad instructions which use fused multiple add 2.8.6.4 Avoid Barriers When Possible ##################################### Using barriers in a kernel on the CPU causes a significant performance penalty compared to the same kernel without barriers. Use a barrier only if the kernel requires it for correctness, and consider changing the algorithm to reduce barriers usage. - + 2.8.7 Optimizing Kernels for Southern Island GPUs ++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1625,10 +1625,10 @@ Using barriers in a kernel on the CPU causes a significant performance penalty c ####################################### A conditional of the form "if-then-else" generates branching. Use the ``select()`` function to replace these structures with conditional assignments that do not cause branching. For example: :: - + if(x==1) r=0.5; if(x==2) r=1.0; - + becomes :: @@ -1641,7 +1641,7 @@ Note that if the body of the ``if`` statement contains an I/O, the ``if`` statem ############################### A conditional expression with many terms can compile into nested conditional code due to the C-language requirement that expressions must short circuit. To prevent this, move the expression out of the control flow statement. For example:: - + if(a&&b&&c&&d){...} becomes @@ -1662,7 +1662,7 @@ Because the GPU is a Vector ALU architecture, there is a cost to executing an `` 2.8.7.5 Experiment With do/while/for Loops ########################################### ``for`` loops can generate more conditional code than equivalent ``do`` or ``while`` loops. Experiment with these different loop types to find the one with best performance. - + 2.9 Specific Guidelines for GCN family GPUs -------------------------------------------- @@ -1680,9 +1680,9 @@ Typical scalar instructions execute in four cycles. The scalar engine can accept Typical vector instructions execute in four cycles. SIMDs within a compute unit can overlap vector instruction execution; each SIMD unit is offset by one cycle from the previous one. This allows each SIMD unit to execute one Vector ALU instruction and one scalar ALU instruction every four clocks. -All GCN GPUs have double-precision support. For Tahiti (AMD Radeon™ HD 79XX series), double precision adds run at one-half the single precision add rate. Double-precision multiplies and MAD instructions run at one-quarter the floating- point rate. - -The double-precision rate of Pitcairn (AMD Radeon™ HD 78XX series) and Cape Verde (AMD Radeon™ HD 77XX series) is one quarter that of Tahiti. This also affects the performance of single-precision fused multiple add (FMA). +All GCN GPUs have double-precision support. For Tahiti (AMD Radeon(TM) HD 79XX series), double precision adds run at one-half the single precision add rate. Double-precision multiplies and MAD instructions run at one-quarter the floating- point rate. + +The double-precision rate of Pitcairn (AMD Radeon(TM) HD 78XX series) and Cape Verde (AMD Radeon(TM) HD 77XX series) is one quarter that of Tahiti. This also affects the performance of single-precision fused multiple add (FMA). Similar to previous generations local data share (LDS) is a shared resource within a compute unit. The maximum LDS allocation size for a work-group is still 32 kB, however each compute unit has a total of 64 kB of LDS. 
On SI GPUs, LDS memory has 32 banks; thus, it is important to be aware of LDS bank conflicts on half-wavefront boundaries. The allocation granularity for LDS is 256 bytes; the minimum size is 0 bytes. It is much easier to achieve high LDS bandwidth use on SI hardware. @@ -1701,7 +1701,7 @@ Since there are no more clauses in the instruction set architecture (ISA) for GC * The engine is wider than previous generations; this means larger dispatches are required to keep the all the compute units busy. * A single wavefront can take twice as long to execute compared to previous generations (assuming ALU bound). This is because GPUs with VLIW-4 could execute the four instructions in a VLIW bundle in eight clocks (typical), and SI GPUs can execute one vector instruction in four clocks (typical). * Execution of kernel dispatches can overlap if there are no dependencies between them and if there are resources available in the GPU. This is critical when writing benchmarks it is important that the measurements are accurate and that "false dependencies" do not cause unnecessary slowdowns.An example of false dependency is: - + a. Application creates a kernel "foo". b. Application creates input and output buffers. c. Application binds input and output buffers to kernel "foo". @@ -1728,7 +1728,7 @@ Table 2.4 provides a simplified picture showing the Northern Island compute unit **Figure 2.4 Northern Islands Compute Unit Arrangement** Table 2.5 provides a simplified picture showing the Southern Island compute unit arrangement. - + .. image:: Opencl_optimization_images/2.5.png **Figure 2.5 Southern Island Compute Unit Arrangement** @@ -1744,7 +1744,7 @@ The following table provides device-specific information for some AMD Southern I +-------------------------------+-----------+----------+--------------+-------------+------------+-----------+ | | Verde PRO | Verde XT | Pitcairn PRO | Pitcairn XT | Tahiti PRO | Tahiti XT | +===============================+===========+==========+==============+=============+============+===========+ -| Product Name (AMD Radeon™ HD) | 7750 | 7770 | 7850 | 7870 | 7950 | 7970 | +| Product Name (AMD Radeon(TM) HD) | 7750 | 7770 | 7850 | 7870 | 7950 | 7970 | +-------------------------------+-----------+----------+--------------+-------------+------------+-----------+ | Engine Speed (MHz) | 800 | 1000 | 860 | 1000 | 800 | 925 | +-------------------------------+-----------+----------+--------------+-------------+------------+-----------+ @@ -1822,22 +1822,22 @@ This chapter discusses performance and optimization when programming for AMD GPU ------------------------------- Figure 3.1 is a block diagram of the GPU memory system. The up arrows are read paths, the down arrows are write paths. WC is the write combine cache. -The GPU consists of multiple compute units. Each compute unit contains 32 kB local (on-chip) memory, L1 cache, registers, and 16 processing element (PE). Each processing element contains a five-way (or four-way, depending on the GPU type) VLIW processor. Individual work-items execute on a single processing element; one or more work-groups execute on a single compute unit. On a GPU, hardware schedules the work-items. On the ATI Radeon™ HD 5000 series of GPUs, hardware schedules groups of work-items, called wavefronts, onto stream cores; thus, work-items within a wavefront execute in lock-step; the same instruction is executed on different data. +The GPU consists of multiple compute units. 
Each compute unit contains 32 kB local (on-chip) memory, L1 cache, registers, and 16 processing element (PE). Each processing element contains a five-way (or four-way, depending on the GPU type) VLIW processor. Individual work-items execute on a single processing element; one or more work-groups execute on a single compute unit. On a GPU, hardware schedules the work-items. On the ATI Radeon(TM) HD 5000 series of GPUs, hardware schedules groups of work-items, called wavefronts, onto stream cores; thus, work-items within a wavefront execute in lock-step; the same instruction is executed on different data. -The L1 cache is 8 kB per compute unit. (For the ATI Radeon™ HD 5870 GPU, this means 160 kB for the 20 compute units.) The L1 cache bandwidth on the ATI Radeon™ HD 5870 GPU is one terabyte per second: +The L1 cache is 8 kB per compute unit. (For the ATI Radeon(TM) HD 5870 GPU, this means 160 kB for the 20 compute units.) The L1 cache bandwidth on the ATI Radeon(TM) HD 5870 GPU is one terabyte per second: L1 Bandwidth = Compute Units * Wavefront Size/Compute Unit * EngineClock Multiple compute units share L2 caches. -The L2 cache size on the ATI Radeon™ HD 5870 GPUs is 512 kB:L2 Cache Size = Number or channels * L2 per Channel -The bandwidth between L1 caches and the shared L2 cache is 435 GB/s: +The L2 cache size on the ATI Radeon(TM) HD 5870 GPUs is 512 kB:L2 Cache Size = Number or channels * L2 per Channel +The bandwidth between L1 caches and the shared L2 cache is 435 GB/s: L2 Bandwidth = Number of channels * Wavefront Size * Engine Clock - + .. image:: Opencl_optimization_images/3.1.png **Figure 3.1 Memory System** -The ATI Radeon™ HD 5870 GPU has eight memory controllers ("Memory Channel" in Figure 3.1). The memory controllers are connected to multiple banks of memory. The memory is GDDR5, with a clock speed of 1200 MHz and a data rate of 4800 Mb/pin. Each channel is 32-bits wide, so the peak bandwidth for the ATI Radeon™ HD 5870 GPU is: (8 memory controllers) * (4800 Mb/pin) * (32 bits) * (1 B/8b) = 154 GB/s +The ATI Radeon(TM) HD 5870 GPU has eight memory controllers ("Memory Channel" in Figure 3.1). The memory controllers are connected to multiple banks of memory. The memory is GDDR5, with a clock speed of 1200 MHz and a data rate of 4800 Mb/pin. Each channel is 32-bits wide, so the peak bandwidth for the ATI Radeon(TM) HD 5870 GPU is: (8 memory controllers) * (4800 Mb/pin) * (32 bits) * (1 B/8b) = 154 GB/s If two memory access requests are directed to the same controller, the hardware serializes the access. This is called a channel conflict. Similarly, if two memory access requests go to the same memory bank, hardware serializes the access. This is called a bank conflict. From a developer's point of view, there is not much difference between channel and bank conflicts. A large power of two stride results in a channel conflict; a larger power of two stride results in a bank conflict. The size of the power of two stride that causes a specific type of conflict depends on the chip. A stride that results in a channel conflict on a machine with eight channels might result in a bank conflict on a machine with four. 
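
As a small illustration of the stride problem (the kernels and the padding amount are hypothetical, not taken from this guide), consecutive work-items in the first kernel below each read an address 4 KB apart, a large power-of-two stride that tends to direct every request to the same channel; padding the stride to a non-power-of-two spreads the requests across channels: ::

    __kernel void read_strided(__global const float *in, __global float *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = in[gid * 1024];          /* 1024 floats = 4 KB stride: likely channel conflict */
    }

    __kernel void read_padded(__global const float *in, __global float *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = in[gid * (1024 + 16)];   /* assumes the buffer was allocated with 16 floats of padding per row */
    }
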
@@ -1846,7 +1846,7 @@ In this document, the term bank conflict is used to refer to either kind of conf 3.1.1 Two Memory Paths ++++++++++++++++++++++++ -ATI Radeon™ HD 5000 series graphics processors have two, independent memory paths between the compute units and the memory: +ATI Radeon(TM) HD 5000 series graphics processors have two, independent memory paths between the compute units and the memory: * FastPath performs only basic operations, such as loads and stores (data sizes must be a multiple of 32 bits). This often is faster and preferred when there are no advanced operations. * CompletePath, supports additional advanced operations, including atomics and sub-32-bit (byte/short) data transfers. @@ -1854,7 +1854,7 @@ ATI Radeon™ HD 5000 series graphics processors have two, independent memory pa 3.1.1.1 Performance Impact of FastPath and CompletePath ######################################################## -There is a large difference in performance on ATI Radeon™ HD 5000 series hardware between FastPath and CompletePath. Figure 3.2 shows two kernels (one FastPath, the other CompletePath) and the delivered DRAM bandwidth for each kernel on the ATI Radeon™ HD 5870 GPU. Note that an atomic add forces CompletePath. +There is a large difference in performance on ATI Radeon(TM) HD 5000 series hardware between FastPath and CompletePath. Figure 3.2 shows two kernels (one FastPath, the other CompletePath) and the delivered DRAM bandwidth for each kernel on the ATI Radeon(TM) HD 5870 GPU. Note that an atomic add forces CompletePath. .. image:: Opencl_optimization_images/3.2.png @@ -1865,7 +1865,7 @@ There is a large difference in performance on ATI Radeon™ HD 5000 series hardw The kernel code follows. Note that the atomic extension must be enabled under OpenCL 1.0. :: - + __kernel void CopyFastPath( global const float * input, global float * output) @@ -1883,7 +1883,7 @@ The kernel code follows. Note that the atomic extension must be enabled under Op } output[gid] = input[gid]; return ; - } + } Table 3.1 lists the effective bandwidth and ratio to maximum bandwidth. @@ -1916,7 +1916,7 @@ There are two ways to find out which path is used. The first method uses the Cod The second method is static and lets you determine the path by looking at a machine-level ISA listing (using the AMD CodeXL Static Kernel Analyzer in OpenCL). :: - + MEM_RAT_CACHELESS -> FastPath MEM_RAT -> CompPath MEM_RAT_NOP_RTN -> Comp_load @@ -1927,8 +1927,8 @@ FastPath operations appear in the listing as:: TEX: ... ... VFETCH ... ... MEM_RAT_CACHELESS_STORE_RAW: ... - ... - + ... + The ``vfetch` Instruction is a load type that in graphics terms is called vertex a fetch (the group control TEX indicates that the load uses the L1 cache.) The instruction ``MEM_RAT_CACHELESS`` indicates that FastPath operations are used. Loads in CompletePath are a split-phase operation. In the first phase, hardware copies the old value of a memory location into a special buffer. This is done by performing atomic operations on the memory location. After the value has reached the buffer, a normal load is used to read the value. Note that RAT stands for random access target, which is the same as an unordered access view (UAV); it allows, on DX11 hardware, writes to, and reads from, any arbitrary location in a buffer. @@ -1950,7 +1950,7 @@ The instruction sequence means the following: **TEX** - Use the L1 cache for the next instruction. **VFETCH** - Do a load instruction to (finally) get the value. 
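
Before looking at stores, a short hedged example of what typically selects each path (kernel names are illustrative): since atomics and sub-32-bit accesses force CompletePath, a byte-wise copy would be expected to compile to MEM_RAT (CompletePath), while the same copy on 32-bit data can compile to MEM_RAT_CACHELESS (FastPath): ::

    /* Sub-32-bit accesses are handled by the CompletePath. */
    __kernel void copy_bytes(__global const char *in, __global char *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = in[gid];
    }

    /* The same copy on 32-bit data can use the FastPath. */
    __kernel void copy_uints(__global const uint *in, __global uint *out)
    {
        size_t gid = get_global_id(0);
        out[gid] = in[gid];
    }
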
- + Stores appear as: :: @@ -1974,48 +1974,48 @@ When the application has complete control of the access pattern and address gene In this example:: - for (ptr=base; ptr> B) & C ==> [u]bit_extract - + where - + | B and C are compile time constants, | A is a 8/16/32bit integer type, and | C is a mask. - + * Bitfield insert on signed/unsigned integers | ((A & B) << C) | ((D & E) << F ==> ubit_insert - + where - + | B and E have no conflicting bits (B^E == 0), | B, C, E, and F are compile-time constants, and | B and E are masks. | The first bit set in B is greater than the number of bits in E plus the first bit set in E, or the first bit set in E is greater than the number of bits in B plus the first bit set in B. | If B, C, E, or F are equivalent to the value 0, this optimization is also supported. - + 3.9 Clause Boundaries ---------------------- AMD GPUs groups instructions into clauses. These are broken at control-flow boundaries when: * the instruction type changes (for example, from FETCH to ALU), or - * if the clause contains the maximum amount of operations (the maximum size for an ALU clause is 128 operations). + * if the clause contains the maximum amount of operations (the maximum size for an ALU clause is 128 operations). ALU and LDS access instructions are placed in the same clause. FETCH, ALU/LDS, and STORE instructions are placed into separate clauses. @@ -3103,7 +3103,7 @@ ALU dependencies on memory operations are handled at the clause level. Specifica Switching to another clause in the same wavefront requires approximately 40 cycles. The hardware immediately schedules another wavefront if one is available, so developers are encouraged to provide multiple wavefronts/compute unit. The cost to switch clauses is far less than the memory latency; typically, if the program is designed to hide memory latency, it hides the clause latency as well. The address calculations for FETCH and STORE instructions execute on the same hardware in the compute unit as do the ALU clauses. The address calculations for memory operations consumes the same executions resources that are used for floating-point computations. - + * The ISA dump shows the clause boundaries. See the example shown below. For more information on clauses, see the AMD Evergreen-Family ISA Microcode And Instructions (v1.0b) and the AMD R600/R700/Evergreen Assembly Language Format documents. @@ -3111,22 +3111,22 @@ And Instructions (v1.0b) and the AMD R600/R700/Evergreen Assembly Language Forma The following is an example disassembly showing clauses. There are 13 clauses in the kernel. The first clause is an ALU clause and has 6 instructions. :: - + 00 ALU_PUSH_BEFORE: ADDR(32) CNT(13) KCACHE0(CB1:0-15) KCACHE1(CB0:0-15) 0 x: MOV R3.x, KC0[0].x y: MOV R2.y, KC0[0].y z: MOV R2.z, KC0[0].z w: MOV R2.w, KC0[0].w 1 x: MOV R4.x, KC0[2].x - y: MOV R2.y, KC0[2].y + y: MOV R2.y, KC0[2].y z: MOV R2.z, KC0[2].z w: MOV R2.w, KC0[2].w t: SETGT_INT R5.x, PV0.x, 0.0f 2 t: MULLO_INT __, R1.x, KC1[1].x 3 y: ADD_INT __, R0.x, PS2 4 x: ADD_INT R0.x, PV3.y, KC1[6].x - 5 x: PREDNE_INT __, R5.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED - + 5 x: PREDNE_INT __, R5.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED + 01 JUMP POP_CNT(1) ADDR(12) 02 ALU: ADDR(45) CNT(5) KCACHE0(CB1:0-15) 6 z: LSHL __, R0.x, @@ -3135,11 +3135,11 @@ The following is an example disassembly showing clauses. 
There are 13 clauses in 03 LOOP_DX10 i0 FAIL_JUMP_ADDR(11) 04 ALU: ADDR(50) CNT(4) 9 x: ADD_INT R3.x, -1, R3.x - y: LSHR R0.y, R4.x, (0x00000002, 2.802596929e-45f).x + y: LSHR R0.y, R4.x, (0x00000002, 2.802596929e-45f).x t: ADD_INT R4.x, R4.x, (0x00000004, 5.605193857e-45f).y 05 WAIT_ACK: Outstanding_acks <= 0 06 TEX: ADDR(64) CNT(1) - 10 VFETCH R0.x__, R0.y, fc156 MEGA(4) + 10 VFETCH R0.x__, R0.y, fc156 MEGA(4) FETCH_TYPE(NO_INDEX_OFFSET) 07 ALU: ADDR(54) CNT(3) 11 x: MULADD_e R0.x, R0.x, (0x40C00000, 6.0f).y, (0x41880000, 17.0f).x @@ -3150,8 +3150,8 @@ The following is an example disassembly showing clauses. There are 13 clauses in 10 ENDLOOP i0 PASS_JUMP_ADDR(4) 11 POP (1) ADDR(12) 12 NOP NO_BARRIER - END_OF_PROGRAM - + END_OF_PROGRAM + 3.10 Additional Performance Guidance ------------------------------------- @@ -3165,7 +3165,7 @@ The compiler directive ``#pragma unroll `` can be placed immedia Examples for using this loop follow. No unrolling example:: - + #pragma unroll 1 for (int i = 0; i < n; i++) { ... @@ -3176,8 +3176,8 @@ Partial unrolling example:: #pragma unroll 4 for (int i = 0; i < 128; i++) { ... - } - + } + Currently, the unroll pragma requires that the loop boundaries can be determined at compile time. Both loop bounds must be known at compile time. If n is not given, it is equivalent to the number of iterations of the loop when both loop bounds are known. If the unroll-factor is not specified, and the compiler can determine the loop count, the compiler fully unrolls the loop. If the unroll-factor is not specified, and the compiler cannot determine the loop count, the compiler does no unrolling. @@ -3185,7 +3185,7 @@ Currently, the unroll pragma requires that the loop boundaries can be determined +++++++++++++++++++++ There are many possible physical memory layouts for images. AMD devices can access memory in a tiled or in a linear arrangement. - + * Linear - A linear layout format arranges the data linearly in memory such that element addresses are sequential. This is the layout that is familiar to CPU programmers. This format must be used for OpenCL buffers; it can be used for images. * Tiled - A tiled layout format has a pre-defined sequence of element blocks arranged in sequential memory addresses (see Figure 3.11 for a conceptual illustration). A microtile consists of ABIJ; a macrotile consists of the top-left 16 squares for which the arrows are red. Only images can use this format. Translating from user address space to the tiled arrangement is transparent to the user. Tiled memory layouts provide an optimized memory access pattern to make more efficient use of the RAM attached to the GPU compute device. This can contribute to lower latency. @@ -3206,20 +3206,20 @@ Memory access patterns in compute kernels are usually different from those in th * Avoid declaring global arrays on the kernel's stack frame as these typically cannot be allocated in registers and require expensive global memory operations. * Use predication rather than control-flow. The predication allows the GPU to execute both paths of execution in parallel, which can be faster than attempting to minimize the work through clever control-flow. The reason for this is that if no memory operation exists in a ``?:`` operator (also called a ternary operator), this operation is translated into a single ``cmov_logical`` instruction, which is executed in a single cycle. An example of this is : :: - + If (A>B) { C += D; } else { C -= D; - } + } Replace this with:: - + int factor = (A>B) ? 
1:-1; C += factor*D; In the first block of code, this translates into an IF/ELSE/ENDIF sequence of CF clauses, each taking ~40 cycles. The math inside the control flow adds two cycles if the control flow is divergent, and one cycle if it is not. This code executes in ~120 cycles. -In the second block of code, the ``?:`` operator executes in an ALU clause, so no extra CF instructions are generated. Since the instructions are sequentially dependent, this block of code executes in three cycles, for a ~40x speed improvement. To see this, the first cycle is the (A>B) comparison, the result of which is input to the second cycle, which is the ``cmov_logical`` factor, bool, 1, -1. The final cycle is a MAD instruction that: mad C, factor, D, C. If the ratio between CF clauses and ALU instructions is low, this is a good pattern to remove the control flow. +In the second block of code, the ``?:`` operator executes in an ALU clause, so no extra CF instructions are generated. Since the instructions are sequentially dependent, this block of code executes in three cycles, for a ~40x speed improvement. To see this, the first cycle is the (A>B) comparison, the result of which is input to the second cycle, which is the ``cmov_logical`` factor, bool, 1, -1. The final cycle is a MAD instruction that: mad C, factor, D, C. If the ratio between CF clauses and ALU instructions is low, this is a good pattern to remove the control flow. * Loop Unrolling * OpenCL kernels typically are high instruction-per-clock applications. Thus, the overhead to evaluate control-flow and execute branch instructions can consume a significant part of resource that otherwise can be used for high-throughput compute operations. @@ -3228,10 +3228,10 @@ In the second block of code, the ``?:`` operator executes in an ALU clause, so n * When tuning an algorithm, it is often beneficial to code a simple but accurate algorithm that is retained and used for functional comparison. GPU tuning can be an iterative process, so success requires frequent experimentation, verification, and performance measurement. * The profiler and analysis tools report statistics on a per-kernel granularity. To narrow the problem further, it might be useful to remove or comment-out sections of code, then re-run the timing and profiling tool. * Writing code with dynamic pointer assignment should be avoided on the GPU. For example:: - + kernel void dyn_assign(global int* a, global int* b, global int* c) { - global int* d; + global int* d; size_t idx = get_global_id(0); if (idx & 1) { d = b; @@ -3239,20 +3239,20 @@ In the second block of code, the ``?:`` operator executes in an ALU clause, so n d = c; } a[idx] = d[idx]; - } + } + + This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what aEUR~d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. - This is inefficient because the GPU compiler must know the base pointer that every load comes from and in this situation, the compiler cannot determine what ‘d' points to. So, both B and C are assigned to the same GPU resource, removing the ability to do certain optimizations. - * If the algorithm allows changing the work-group size, it is possible to get better performance by using larger work-groups (more work-items in each work-group) because the workgroup creation overhead is reduced. 
On the other hand, the OpenCL CPU runtime uses a task-stealing algorithm at the work-group level, so when the kernel execution time differs because it contains conditions and/or loops of varying number of iterations, it might be better to increase the number of work-groups. This gives the runtime more flexibility in scheduling work-groups to idle CPU cores. Experimentation might be needed to reach optimal work-group size. * Since the AMD OpenCL runtime supports only in-order queuing, using ``clFinish`` () on a queue and queuing a blocking command gives the same result. The latter saves the overhead of another API command. For example:: - + clEnqueueWriteBuffer(myCQ, buff, **CL_FALSE**, 0, buffSize, input, 0, NULL, NULL);`` clFinish(myCQ); - + is equivalent, for the AMD OpenCL runtime, to:: - + clEnqueueWriteBuffer(myCQ, buff, **CL_TRUE**, 0, buffSize, input, 0, NULL, NULL);`` @@ -3262,24 +3262,24 @@ In the second block of code, the ``?:`` operator executes in an ALU clause, so n * Porting from CUDA to OpenCL is relatively straightforward. Multiple vendors have documents describing how to do this, including AMD : http://developer.amd.com/tools-and-sdks/opencl-zone/ * Some specific performance recommendations which differ from other GPU architectures: - * Use a workgroup size that is a multiple of 64. CUDA code can use a workgroup size of 32; this uses only half the available compute resources on an ATI Radeon™ HD 5870 GPU. + * Use a workgroup size that is a multiple of 64. CUDA code can use a workgroup size of 32; this uses only half the available compute resources on an ATI Radeon(TM) HD 5870 GPU. * Vectorization can lead to substantially greater efficiency. The ``ALUPacking`` counter provided by the Profiler can track how well the kernel code is using the five-wide (or four-wide, depending on the GPU type) VLIW unit. Values below 70 percent may indicate that dependencies are preventing the full use of the processor. For some kernels, vectorization can be used to increase efficiency and improve kernel performance. - * AMD GPUs have a very high single-precision flops capability (2.72 teraflops in a single ATI Radeon™ HD 5870 GPU). Algorithms that benefit from such throughput can deliver excellent performance on AMD hardware. + * AMD GPUs have a very high single-precision flops capability (2.72 teraflops in a single ATI Radeon(TM) HD 5870 GPU). Algorithms that benefit from such throughput can deliver excellent performance on AMD hardware. 3.10.5 Guidance for CPU Programmers Using OpenCL to Program GPUs ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ OpenCL is the industry-standard toolchain for programming GPUs and parallel devices from many vendors. It is expected that many programmers skilled in CPU programming will program GPUs for the first time using OpenCL. This section provides some guidance for experienced programmers who are programming a GPU for the first time. It specifically highlights the key differences in optimization strategy. - + * Study the local memory (LDS) optimizations. These greatly affect the GPU performance. Note the difference in the organization of local memory on the GPU as compared to the CPU cache. Local memory is shared by many work-items (64 on Cypress). This contrasts with a CPU cache that normally is dedicated to a single work-item. GPU kernels run well when they collaboratively load the shared memory. * GPUs have a large amount of raw compute horsepower, compared to memory bandwidth and to "control flow" bandwidth. 
This leads to some high- level differences in GPU programming strategy. * A CPU-optimized algorithm may test branching conditions to minimize the workload. On a GPU, it is frequently faster simply to execute the workload. * A CPU-optimized version can use memory to store and later load pre- computed values. On a GPU, it frequently is faster to recompute values rather than saving them in registers. Per-thread registers are a scarce resource on the CPU; in contrast, GPUs have many available per-thread register resources. - + * Use ``float4`` and the OpenCL built-ins for vector types `` (vload, vstore,`` etc.). These enable the AMD OpenCL implementation to generate efficient, packed SSE instructions when running on the CPU. Vectorization is an optimization that benefits both the AMD CPU and GPU. - + 3.10.6 Optimizing Kernel Code ++++++++++++++++++++++++++++++ @@ -3305,12 +3305,12 @@ The Bulldozer family of CPUs supports FMA4 instructions, exchanging instructions There also is hardware support for OpenCL functions that give the new hardware implementation of rotating. For example:: - + sum.x += tempA0.x * tempB0.x + tempA0.y * tempB1.x + tempA0.z * tempB2.x + tempA0.w * tempB3.x; can be written as a composition of mad instructions which use fused multiple add (FMA):: - + sum.x += mad(tempA0.x, tempB0.x, mad(tempA0.y, tempB1.x, mad(tempA0.z, tempB2.x, tempA0.w*tempB3.x))); @@ -3332,7 +3332,7 @@ The AMD CodeXL Static Kernel Analyzer assembler listing lets you view clauses. T 3.10.7.2 Remove Conditional Assignments ######################################## A conditional of the form "if-then-else" generates branching and thus generates one or more clauses. Use the ``select()`` function to replace these structures with conditional assignments that do not cause branching. For example:: - + if(x==1) r=0.5; if(x==2) r=1.0; @@ -3346,14 +3346,14 @@ Note that if the body of the ``if`` statement contains an I/O, the if statement 3.10.7.3 Bypass Short-Circuiting ################################## A conditional expression with many terms can compile into a number of clauses due to the C-language requirement that expressions must short circuit. To prevent this, move the expression out of the control flow statement. For example:: - + if(a&&b&&c&&d){...} becomes :: bool cond = a&&b&&c&&d; if(cond){...} - + The same applies to conditional expressions used in loop constructs `` (do, while, for)``. 3.10.7.4 Unroll Small Loops @@ -3364,7 +3364,7 @@ If the loop bounds are known, and the loop is small (less than 16 or 32 instruct 3.10.7.5 Avoid Nested ifs ########################## Because the GPU is a Vector ALU architecture, there is a cost to executing an if-then-else block because both sides of the branch are evaluated, then one result is retained while the other is discarded. When if blocks are nested, the results are twice as bad; in general, if blocks are nested k levels deep, there 2^k clauses are generated. In this situation, restructure the code to eliminate nesting. - + 3.10.7.6 Experiment With do/while/for Loops ############################################# ``for`` loops can generate more clauses than equivalent ``do`` or ``while`` loops. Experiment with these different loop types to find the one with best performance. 
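
To make the conditional-assignment guidance of Section 3.10.7.2 concrete, a minimal sketch (variable names are illustrative) of the branch-free form using ``select()`` follows: ::

    /* Branching form: each if contributes control-flow clauses. */
    if (x == 1) r = 0.5f;
    if (x == 2) r = 1.0f;

    /* Branch-free form: conditional moves evaluated in the ALU. */
    r = select(r, 0.5f, x == 1);
    r = select(r, 1.0f, x == 2);
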
diff --git a/Programming_Guides/Opencl-programming-guide.rst b/Programming_Guides/Opencl-programming-guide.rst index 63b27374..1c57861f 100644 --- a/Programming_Guides/Opencl-programming-guide.rst +++ b/Programming_Guides/Opencl-programming-guide.rst @@ -11,13 +11,13 @@ OpenCL Programming Guide * :ref:`Synchronization` * :ref:`Memory-Arch` * :ref:`Example` - - * :ref:`AMD_Implementation` + + * :ref:`AMD_Implementation` * :ref:`AMD-ROCm-Implementation` - * :ref:`Hardware-Overview-GCNDevices` + * :ref:`Hardware-Overview-GCNDevices` * :ref:`Communication-Host-GPU` * :ref:`Wavefront-Scheduling` - + * :ref:`Build_Run_Opencl` * :ref:`Compilin-Host-Program` * :ref:`Compiling-device-programs` @@ -28,36 +28,36 @@ OpenCL Programming Guide * :ref:`Running-Program` * :ref:`note-on-thread-safety` * :ref:`Toolchain-considerations` - + * :ref:`Profiling_OpenCL` * :ref:`AMD-CodeXL-GPU` - - * :ref:`OpenCL_static` + + * :ref:`OpenCL_static` * :ref:`Overview` - * :ref:`OpenCL-C-Runtime` + * :ref:`OpenCL-C-Runtime` * :ref:`C-Programming-Language` * :ref:`Examples` - - * :ref:`OpenCL_2.0` + + * :ref:`OpenCL_2.0` * :ref:`Introduction` * :ref:`Shared-virtual-Memory` * :ref:`Generi` - * :ref:`Device-side-enqueue` + * :ref:`Device-side-enqueue` * :ref:`Atomics` * :ref:`Pipes` * :ref:`Program-scope-global-Variables` * :ref:`Image-Enhancements` * :ref:`Non-uniform-work-group-size` * :ref:`Portability-considerations` - + * :ref:`OpenCL_Extentions` * :ref:`ICD` * :ref:`BIF` * :ref:`pre_GCN_Devices` * :ref:`OpenCL_OpenGL` * :ref:`Functions_OpenCL` - - + + .. _OpenCL Architecture: OpenCL Architecture and AMD Accelerated Parallel Processing Technology ======================================================================= @@ -66,13 +66,13 @@ OpenCL Architecture and AMD Accelerated Parallel Processing Technology Terminology ############ -**compute kernel :** +**compute kernel :** To define a compute kernel, it is first necessary to define a kernel. A kernel is a small unit of execution that performs a clearly defined function and that can be executed in parallel. Such a kernel can be executed on each element of an input stream (called an NDRange), or simply at each point in an arbitrary index space. A kernel is analogous and, on some devices identical, to what graphics programmers call a shader program. This kernel is not to be confused with an OS kernel, which controls hardware. The most basic form of an NDRange is simply mapped over input data and produces one output item for each input tuple. Subsequent extensions of the basic model provide random-access functionality, variable output counts, and reduction/accumulation operations. Kernels are specified using the kernel keyword. A compute kernel is a specific type of kernel that is not part of the traditional graphics pipeline. The compute kernel type can be used for graphics, but its strength lies in using it for non-graphics fields such as physics, AI, modeling, HPC, and various other computationally intensive applications. -In a compute kernel, the work-item spawn order is sequential. This means that on a chip with N work-items per wavefront, the first N work- items go to wavefront 1, the second N work-items go to wavefront 2, etc. Thus, the work-item IDs for wavefront K are in the range (K•N) to ((K+1)•N)-1. +In a compute kernel, the work-item spawn order is sequential. This means that on a chip with N work-items per wavefront, the first N work- items go to wavefront 1, the second N work-items go to wavefront 2, etc. 
Thus, the work-item IDs for wavefront K are in the range (K*N) to ((K+1)*N)-1.

**wavefronts and work-groups :**

@@ -84,7 +84,7 @@ Work-groups are composed of wavefronts. Best performance is attained when the gr

**local data store(LDS) :**

-The LDS is a high-speed, low-latency memory private to each compute unit. It is a full gather/scatter model: a work-group can write anywhere in its allocated space. This model is unchanged for the AMD Radeon™ HD 7XXX series. The constraints of the current LDS model are:
+The LDS is a high-speed, low-latency memory private to each compute unit. It is a full gather/scatter model: a work-group can write anywhere in its allocated space. This model is unchanged for the AMD Radeon(TM) HD 7XXX series. The constraints of the current LDS model are:

* The LDS size is allocated per work-group. Each work-group specifies how much of the LDS it requires. The hardware scheduler uses this information to determine which work groups can share a compute unit.
* Data can only be shared within work-items in a work-group.
@@ -114,14 +114,14 @@ executing kernels for specific devices.

.. image:: images/img1.png
   :align: center
- 
+
The devices are capable of running data- and task-parallel work. A kernel can be executed as a function of multi-dimensional domains of indices. Each element is called a work-item; the total number of indices is defined as the global work-size. The global work-size can be divided into sub-domains, called work-groups, and individual work-items within a group can communicate through global or locally shared memory. Work-items are synchronized through barrier or fence operations. Figure 1.1 is a representation of the host/device architecture with a single platform, consisting of a GPU and a CPU.

-An OpenCL application is built by first querying the runtime to determine which platforms are present. There can be any number of different OpenCL implementations installed on a single system. The desired OpenCL platform can be selected by matching the platform vendor string to the desired vendor name, such as “Advanced Micro Devices, Inc.” The next step is to create a context. As shown in Figure 1.1, an OpenCL context has associated with it a number of compute devices (for example, CPU or GPU devices),. Within a context, OpenCL guarantees a relaxed consistency between these devices. This means that memory objects, such as buffers or images, are allocated per context; but changes made by one device are only guaranteed to be visible by another device at well-defined synchronization points. For this, OpenCL provides events, with the ability to synchronize on a given event to enforce the correct order of execution.
+An OpenCL application is built by first querying the runtime to determine which platforms are present. There can be any number of different OpenCL implementations installed on a single system. The desired OpenCL platform can be selected by matching the platform vendor string to the desired vendor name, such as "Advanced Micro Devices, Inc." The next step is to create a context. As shown in Figure 1.1, an OpenCL context has associated with it a number of compute devices (for example, CPU or GPU devices),. Within a context, OpenCL guarantees a relaxed consistency between these devices. This means that memory objects, such as buffers or images, are allocated per context; but changes made by one device are only guaranteed to be visible by another device at well-defined synchronization points.
For this, OpenCL provides events, with the ability to synchronize on a given event to enforce the correct order of execution. -Many operations are performed with respect to a given context; there also are many operations that are specific to a device. For example, program compilation and kernel execution are done on a per-device basis. Performing work with a device, such as executing kernels or moving data to and from the device’s local memory, is done using a corresponding command queue. A command queue is associated with a single device and a given context; all work for a specific device is done through this interface. Note that while a single command queue can be associated with only a single device, there is no limit to the number of command queues that can point to the same device. For example, it is possible to have one command queue for executing kernels and a command queue for managing data transfers between the host and the device. +Many operations are performed with respect to a given context; there also are many operations that are specific to a device. For example, program compilation and kernel execution are done on a per-device basis. Performing work with a device, such as executing kernels or moving data to and from the device's local memory, is done using a corresponding command queue. A command queue is associated with a single device and a given context; all work for a specific device is done through this interface. Note that while a single command queue can be associated with only a single device, there is no limit to the number of command queues that can point to the same device. For example, it is possible to have one command queue for executing kernels and a command queue for managing data transfers between the host and the device. Most OpenCL programs follow the same pattern. Given a specific platform, select a device or devices to create a context, allocate memory, create device-specific command queues, and perform data transfers and computations. Generally, the platform is the gateway to accessing specific devices, given these devices and a corresponding context, the application is independent of the platform. Given a context, the application can: @@ -147,14 +147,14 @@ There are two types of synchronization between commands in a command- queue: * command-queue barrier - enforces ordering within a single queue. Any resulting changes to memory are available to the following commands in the queue. * events - enforces ordering between, or within, queues. Enqueued commands in OpenCL return an event identifying the command as well as the memory object updated by it. This ensures that following commands waiting on that event see the updated memory objects before they execute. -OpenCL 2.0 provides additional synchronization options. For an overview, see “Atomics and synchronization.”. +OpenCL 2.0 provides additional synchronization options. For an overview, see "Atomics and synchronization.". .. _Memory-Arch: Memory Architecture and Access ################################### -OpenCL has four memory domains: private, local, global, and constant; the AMD Compute Technology system also recognizes host (CPU) and PCI Express® (PCIe® ) memory. +OpenCL has four memory domains: private, local, global, and constant; the AMD Compute Technology system also recognizes host (CPU) and PCI Express(R) (PCIe(R) ) memory. 
============= ==================================================================================================================== Memory Type Description @@ -167,11 +167,11 @@ global Accessible to all work-items executing in a context, as well as to constant Read-only region for host-allocated and -initialized objects that are not changed during kernel execution. -host (CPU) Host-accessible region for an application’s data structures and program data. +host (CPU) Host-accessible region for an application's data structures and program data. PCIe Part of host (CPU) memory accessible from, and modifiable by, the host program and the GPU compute device. Modifying this memory requires synchronization between the GPU compute device and the CPU. ============= ==================================================================================================================== - + **Table: illustrates the interrelationship of the memories.** .. image:: images/img2.png @@ -222,7 +222,7 @@ Dataflow in Memory Hierarchy .. image:: images/img5.png :align: center -To load data into LDS from global memory, it is read from global memory and placed into the work-item’s registers; then, a store is performed to LDS. Similarly, to store data into global memory, data is read from LDS and placed into the work- item’s registers, then placed into global memory. To make effective use of the LDS, an algorithm must perform many operations on what is transferred between global memory and LDS. It also is possible to load data from a memory buffer directly into LDS, bypassing VGPRs. +To load data into LDS from global memory, it is read from global memory and placed into the work-item's registers; then, a store is performed to LDS. Similarly, to store data into global memory, data is read from LDS and placed into the work- item's registers, then placed into global memory. To make effective use of the LDS, an algorithm must perform many operations on what is transferred between global memory and LDS. It also is possible to load data from a memory buffer directly into LDS, bypassing VGPRs. LDS atomics are performed in the LDS hardware. (Thus, although ALUs are not directly used for these operations, latency is incurred by the LDS executing this function.) If the algorithm does not require write-to-read reuse (the data is read only), it usually is better to use the image dataflow (see right side of Figure 1.5) because of the cache hierarchy. @@ -256,7 +256,7 @@ Image reads are done by addressing the desired location in the input memory usin Image reads are cached through the texture system (corresponding to the L2 and L1 caches). -.. _Example: +.. _Example: Example Programs ################### @@ -282,7 +282,7 @@ This sample shows a minimalist OpenCL C program that sets a given buffer to some 7. The data is mapped to the host for examination. Calling clEnqueueMapBuffer ensures the visibility of the buffer on the host, which in this case probably includes a physical transfer. Alternatively, we could use ``clEnqueueWriteBuffer()``, which requires a pre-allocated host-side buffer. -**Example Code 1** +**Example Code 1** :: @@ -381,22 +381,22 @@ Example: SAXPY Function This section provides an introductory sample for beginner-level OpenCL programmers using C++ bindings. -The sample implements the SAXPY function (Y = aX + Y, where X and Y are vectors, and a is a scalar). The full code is reproduced at the end of this section. It uses C++ bindings for OpenCL. 
These bindings are available in the CL/cl.hpp file in the AMD Compute SDK; they also are downloadable from the Khronos website: http://www.khronos.org/registry/cl +The sample implements the SAXPY function (Y = aX + Y, where X and Y are vectors, and a is a scalar). The full code is reproduced at the end of this section. It uses C++ bindings for OpenCL. These bindings are available in the CL/cl.hpp file in the AMD Compute SDK; they also are downloadable from the Khronos website: http://www.khronos.org/registry/cl The following steps guide you through this example. 1. Enable error checking through the exception handling mechanism in the C++ bindings by using the following define. :: - + #define CL ENABLE_EXCEPTIONS This removes the need to error check after each OpenCL call. If there is an error, the C++ bindings code throw an exception that is caught at the end of the try block, where we can clean up the host memory allocations. In this example, the C++ object representing OpenCL resources (cl::Context, cl::CommandQueue, etc.) are declared as automatic variables, so they do not need to be released. If an OpenCL call returns an error, the error code is defined in the CL/cl.h file. 2. The kernel is very simple: each work-item, i, does the SAXPY calculation for its corresponding elements ``Y[i] = aX[i] + Y[i]``. Both X and Y vectors are stored in global memory; X is read-only, Y is read-write. - :: - + :: + kernel void saxpy(const __global float * X, __global float * Y, const float a) @@ -414,31 +414,31 @@ The following steps guide you through this example. 4. Create an OpenCL context on that platform. :: - + cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter)(), 0 }; context = cl::Context(CL_DEVICE_TYPE_GPU, cps); 5. Get OpenCL devices from the context. :: - + devices = context.getInfo(); 6. Create an OpenCL command queue. :: - + queue = cl::CommandQueue(context, devices[0]); 7. Create two buffers, corresponding to the X and Y vectors. Ensure the host- side buffers, pX and pY, are allocated and initialized. The CL_MEM_COPY_HOST_PTR flag instructs the runtime to copy over the contents of the host pointer pX in order to initialize the buffer bufX. The bufX buffer uses the CL_MEM_READ_ONLY flag, while bufY requires the CL_MEM_READ_WRITE flag. :: - + bufX = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * length, pX); 8. Create a program object from the kernel source string, build the program for our devices, and create a kernel object corresponding to the SAXPY kernel. (At this point, it is possible to create multiple kernel objects if there are more than one.) :: - + cl::Program::Sources sources(1, std::make_pair(kernelStr.c_str(), kernelStr.length())); program = cl::Program(context, sources); program.build(devices); @@ -448,17 +448,17 @@ The following steps guide you through this example. Set each argument individually in separate kernel.setArg() calls. The arguments, do not need to be set again for subsequent kernelenqueue calls. Reset only those arguments that are to pass a new value to the kernel. Then, enqueue the kernel to the command queue with the appropriate global and local work sizes. :: - + kernel.setArg(0,bufX); kernel.setArg(1,bufY); kernel.setArg(2,a); queue.enqueueNDRangeKernel(kernel, cl::NDRange(), cl::NDRange(length), cl::NDRange(64)); 10. Read back the results from bufY to the host pointer pY. 
We will make this a blocking call (using the CL_TRUE argument) since we do not want to proceed before the kernel has finished execution and we have our results back. :: - + queue.enqueueReadBuffer(bufY, CL_TRUE, 0, length * sizeof(cl_float), pY); 11. Clean up the host resources (pX and pY). OpenCL resources is cleaned up by the C++ bindings support code. - + The catch(cl::Error err) block handles exceptions thrown by the C++ bindings code. If there is an OpenCL call error, it prints out the name of the call and the error code (codes are defined in CL/cl.h). If there is a kernel compilation error, the error code is CL_BUILD_PROGRAM_FAILURE, in which case it is necessary to print out the build log. **Example Code 2** @@ -490,7 +490,7 @@ The following steps guide you through this example. cout << arrayData[i] << " "; cout << endl; } - + ///////////////////////////////////////////////////////////////// // Globals ///////////////////////////////////////////////////////////////// @@ -520,7 +520,7 @@ The following steps guide you through this example. " uint gid = get_global_id(0);\n" " y[gid] = a* x[gid] + y[gid];\n" "}\n"; - + ///////////////////////////////////////////////////////////////// // Allocate and initialize memory on the host ///////////////////////////////////////////////////////////////// @@ -583,7 +583,7 @@ The following steps guide you through this example. break; } } - + ///////////////////////////////////////////////////////////////// // Create an OpenCL context ///////////////////////////////////////////////////////////////// @@ -678,7 +678,7 @@ The code is written so that it performs very well on either CPU or GPU. The numb The sample includes a number of programming techniques useful for simple tests. Only minimal error checking and resource tear-down is used. -Runtime Code – +Runtime Code - 1. The source memory buffer is allocated, and initialized with a random pattern. Also, the actual min() value for this data set is serially computed, in order to later verify the parallel result. @@ -694,8 +694,8 @@ Runtime Code – 5. After the kernels are built, the code prints errors that occurred during kernel compilation and linking. 6. The main loop is set up so that the measured timing reflects the actual kernel performance. If a sufficiently large NLOOPS is chosen, effects from kernel launch time and delayed buffer copies to the device by the CL runtime are minimized. Note that while only a single clFinish() is executed at the end of the timing run, the two kernels are always linked using an event to ensure serial execution. - - The bandwidth is expressed as “number of input bytes processed.” For high- end graphics cards, the bandwidth of this algorithm is about an order of magnitude higher than that of the CPU, due to the parallelized memory subsystem of the graphics card. + + The bandwidth is expressed as "number of input bytes processed." For high- end graphics cards, the bandwidth of this algorithm is about an order of magnitude higher than that of the CPU, due to the parallelized memory subsystem of the graphics card. 7. The results then are checked against the comparison value. This also establishes that the result is the same on both CPU and GPU, which can serve as the first verification test for newly written kernel code. @@ -703,7 +703,7 @@ Runtime Code – 9. You can use the Timer.cpp and Timer.h files from the TransferOverlap sample, which is in the SDK samples. -Kernel Code – +Kernel Code - 10. 
The code uses four-component vectors (uint4) so the compiler can identify concurrent execution paths as often as possible. On the GPU, this can be used to further optimize memory accesses and distribution across ALUs. On the CPU, it can be used to enable SSE like execution. @@ -755,7 +755,7 @@ Kernel Code – " uint idx = (dev == 0) ? get_global_id(0) * count \n" " : get_global_id(0); \n" " uint stride = (dev == 0) ? 1 : get_global_size(0); \n" - " uint pmin = (uint) -1; \n" + " uint pmin = (uint) -1; \n" " // 11. First, compute private min, for this work-item. \n" " for( int n=0; n < count; n++, idx += stride ) \n" " { \n" @@ -779,7 +779,7 @@ Kernel Code – " { \n" " dbg[0] = get_num_groups(0); \n" " dbg[1] = get_global_size(0); \n" - " dbg[2] = count; \n" + " dbg[2] = count; \n" " dbg[3] = stride; \n" " } \n" "} \n" @@ -790,7 +790,7 @@ Kernel Code – "{ \n" " (void) atom_min( gmin, gmin[get_global_id(0)] ); \n" "}; \n"; - + int main(int argc, char ** argv) { cl_platform_id platform; @@ -819,7 +819,7 @@ Kernel Code – // Get a platform. clGetPlatformIDs( 1, &platform, NULL ); - + // 3. Iterate over devices. for(dev=0; dev < NDEVS; dev++) { @@ -1038,8 +1038,8 @@ The AMD ROCm software stack provides end-users and developers with a complete, f The software includes the following components: * OpenCL compiler and runtime - * Debugging and Performance Profiling Tools – AMD CodeXL. - * Performance Libraries – clMath and other OpenCL accelerated libraries for optimized NDRange-specific algorithms. + * Debugging and Performance Profiling Tools - AMD CodeXL. + * Performance Libraries - clMath and other OpenCL accelerated libraries for optimized NDRange-specific algorithms. The latest generations of AMD GPUs use unified shader architectures capable of running different kernel types interleaved on the same hardware.Programmable GPU compute devices execute various user-developed programs,known to graphics programmers as shaders and to compute programmers as kernels. These GPU compute devices can execute non-graphics functions using a data-parallel programming model that maps executions onto compute units. Each compute unit contains one (pre-GCN devices) or more (GCN devices) vector (SIMD) units. In this programming model, known as AMD Accelerated Parallel Processing Technology, arrays of input data elements stored in memory are accessed by a number of compute units. @@ -1065,11 +1065,11 @@ OpenCL maps the total number of work-items to be launched onto an n- dimensional Work-Item Processing ***************************** -All processing elements within a vector unit execute the same instruction in each cycle. For a typical instruction, 16 processing elements execute one instruction for 64 work items over 4 cycles. The block of work-items that are executed together is called a wavefront. For example, on the AMD Radeon™ HD 290X +All processing elements within a vector unit execute the same instruction in each cycle. For a typical instruction, 16 processing elements execute one instruction for 64 work items over 4 cycles. The block of work-items that are executed together is called a wavefront. For example, on the AMD Radeon(TM) HD 290X compute device, the 16 processing elements within each vector unit execute the same instruction for four cycles, which effectively appears as a 64-wide compute unit in execution width. -The size of wavefronts can differ on different GPU compute devices. 
For example, some of the low-end and older GPUs, such as the AMD Radeon™ HD 54XX series graphics cards, have a wavefront size of 32 work-items. Higher-end and newer AMD GPUs have a wavefront size of 64 work-items. +The size of wavefronts can differ on different GPU compute devices. For example, some of the low-end and older GPUs, such as the AMD Radeon(TM) HD 54XX series graphics cards, have a wavefront size of 32 work-items. Higher-end and newer AMD GPUs have a wavefront size of 64 work-items. Compute units operate independently of each other, so it is possible for different compute units to execute different instructions. It is also possible for different vector units within a compute unit to execute different instructions. @@ -1123,8 +1123,8 @@ executes on an ALU, as shown in Figure 2.4). In GCN devices, each CU includes one Scalar Unit and four Vector (SIMD) units, each of which contains an array of 16 processing elements (PEs). Each PE contains one ALU. Each SIMD unit simultaneously executes a single operation across 16 work items, but each can be working on a separate wavefront. -For example, for the AMD Radeon™ HD 79XX devices each of the 32 CUs has one Scalar Unit and four Vector Units. Figure 2.5 shows only two compute engines/command processors of the array that comprises the compute device of -the AMD Radeon™ HD 79XX family. +For example, for the AMD Radeon(TM) HD 79XX devices each of the 32 CUs has one Scalar Unit and four Vector Units. Figure 2.5 shows only two compute engines/command processors of the array that comprises the compute device of +the AMD Radeon(TM) HD 79XX family. .. image:: images/2.5.png @@ -1138,7 +1138,7 @@ The Asynchronous Compute Engines (ACEs) manage the CUs; a graphics command proce Key differences between pre-GCN and GCN devices *********************************************** -In pre-GCN devices (for a hardware overview, see Appendix D, “Hardware overview of pre-GCN devices.”), each compute unit consists of a single vector unit, each containing up to 16 processing elements. Each processing element, which contains 4 or 5 ALUs, could execute bundles of 4 or 5 independent instructions co-issued in a VLIW (Very Long Instruction Word) format. All the processing elements within a vector unit execute a single wavefront (a group of +In pre-GCN devices (for a hardware overview, see Appendix D, "Hardware overview of pre-GCN devices."), each compute unit consists of a single vector unit, each containing up to 16 processing elements. Each processing element, which contains 4 or 5 ALUs, could execute bundles of 4 or 5 independent instructions co-issued in a VLIW (Very Long Instruction Word) format. All the processing elements within a vector unit execute a single wavefront (a group of 64 work items). If operations within a wavefront contain dependencies, they cannot be scheduled in the same clock cycle, leaving some ALUs un-utilized. In such cases, some processing elements (and hence, vector units) remain under- utilized. In GCN devices, the CUs are arranged in four vector unit arrays consisting of 16 processing elements each. Each of these arrays executes a single instruction across each lane for each block of 16 work-items. That instruction is repeated over four cycles to make the 64-element vector called a wavefront. @@ -1161,13 +1161,13 @@ Each ACE contains up to eight hardware queues and, together with the graphics co Devices in the Southern Islands families typically have two ACEs. 
The ACE engines on the Southern Islands families are single-threaded, which means that they contain two hardware queues.

-Devices in the Sea Islands and Volcanic Islands families contain between four and eight ACEs, and are multi-threaded (thereby supporting more hardware queues) so they offer more performance. For example, the AMD Radeon™ R9
+Devices in the Sea Islands and Volcanic Islands families contain between four and eight ACEs, and are multi-threaded (thereby supporting more hardware queues), so they offer more performance. For example, the AMD Radeon(TM) R9
290X devices in the VI family contain 8 ACEs and 44 CUs.

A note on hardware queues
**************************

-A hardware queue can be thought of as a GPU entry point. The GPU can process kernels from several compute queues concurrently. All hardware queues ultimately share the same compute cores. The use of multiple hardware queues is beneficial when launching small kernels that do not fully saturate the GPU. For example, the AMD Radeon™ HD 290X compute device can execute up to
+A hardware queue can be thought of as a GPU entry point. The GPU can process kernels from several compute queues concurrently. All hardware queues ultimately share the same compute cores. The use of multiple hardware queues is beneficial when launching small kernels that do not fully saturate the GPU. For example, the AMD Radeon(TM) HD 290X compute device can execute up to
112,640 threads concurrently. The GPU can execute two kernels, each spawning 56,320 threads (assuming full occupancy), twice as fast when they are launched concurrently through two hardware queues as when they are launched serially through a single hardware queue.

@@ -1232,7 +1232,7 @@ Wavefront Scheduling
#####################

GPU compute devices are very efficient at parallelizing large numbers of work-items in a manner transparent to the application. Each GPU compute device uses the large number of wavefronts to hide memory access latencies by having the resource scheduler switch the active wavefront in a given compute unit whenever the current wavefront is waiting for a memory access to complete. Hiding memory access latencies requires that each work-item contain a large number of ALU operations per memory load/store.

-Figure 2.6 shows the timing of a simplified execution of wavefronts in a single compute unit. At time 0, the wavefronts are queued and waiting for execution. In this example, only four wavefronts (T0…T3) are scheduled for the compute unit. The hardware limit for the number of active wavefront is dependent on the resource usage (such as the number of active registers used) of the program being executed. An optimally programmed GPU compute device typically has many of active wavefronts.
+Figure 2.6 shows the timing of a simplified execution of wavefronts in a single compute unit. At time 0, the wavefronts are queued and waiting for execution. In this example, only four wavefronts (T0...T3) are scheduled for the compute unit. The hardware limit on the number of active wavefronts depends on the resource usage (such as the number of active registers used) of the program being executed. An optimally programmed GPU compute device typically has many active wavefronts.

.. image:: images/2.6.png

@@ -1242,7 +1242,7 @@ At runtime, wavefront T0 executes until cycle 20; at this time, a stall occurs d

If the data that wavefront T0 is waiting for has returned from memory, T0 continues execution.
Since there were enough wavefronts and processing element operations to cover the long memory latencies, the compute unit does not idle. This method of memory latency hiding helps the GPU compute device achieve maximum performance.

-If none of T0 – T3 are runnable, the compute unit waits (stalls) until one of T0 – T3 is ready to execute. In the example shown in Figure 2.7, T0 is the first to continue execution.
+If none of T0 - T3 are runnable, the compute unit waits (stalls) until one of T0 - T3 is ready to execute. In the example shown in Figure 2.7, T0 is the first to continue execution.

.. image:: images/2.7.png

@@ -1260,7 +1260,7 @@ An OpenCL application consists of a host program (C/C++) and an optional kernel

Compiling the Host Program
###########################

-In order to compile the host program, users must install the OpenCL Compiler and language runtime on the ROCm, On Ubuntu is rocm-opencl-dev which provides all the necessary OpenCL runtime headers and libraries required by the host compiler. If wish to support application build with the historical APPS SDK sets an environmental variable named AMDAPPSDKROOT to the path of the directory in which the ROCm OpenCL is installed. It should be /opt/rocm/opencl. The runtime headers and libraries are placed in the install directory under the “include” and “lib” sub-folders, respectively.
+To compile the host program, users must install the OpenCL compiler and language runtime provided by ROCm; on Ubuntu, the rocm-opencl-dev package provides all the necessary OpenCL runtime headers and libraries required by the host compiler. To support applications built with the historical APP SDK, set an environment variable named AMDAPPSDKROOT to the path of the directory in which the ROCm OpenCL is installed; it should be /opt/rocm/opencl. The runtime headers and libraries are placed in the install directory under the "include" and "lib" sub-folders, respectively.

While building the host program, these headers and libraries must be included in the project by choosing the appropriate options for the targeted operating system, IDE, and compiler.

@@ -1273,13 +1273,13 @@ To compile OpenCL applications on Linux, gcc or the Intel C compiler must be ins

1. Compile all the C++ files (Template.cpp), and get the object files.

64-bit object files on 64-bit system::
-
+
	g++ -o Template.o -c Template.cpp -I$ROCMOPENCL/include

2. Link all the object files generated in the previous step to the OpenCL library and create an executable.

For linking to a 64-bit library::
-
+
	g++ -o Template Template.o -lOpenCL -L$ROCMOPENCL/lib/x86_64

@@ -1326,7 +1326,7 @@ Note: Most of the examples in this chapter are shown using runtime C APIs. In or

**Example creation of program objects from an external file :**

-::
+::

	std::ifstream f("my_kernel.cl");
	std::stringstream st;

@@ -1366,40 +1366,40 @@ Suppose a program object has been created as follows:

Next, the program object can be built for all the devices in the context or for a list of selected devices.
-* To build the program for all the devices, “NULL” must be passed against the target device list argument, as shown below: +* To build the program for all the devices, "NULL" must be passed against the target device list argument, as shown below: + +:: -:: - clBuildProgram(program, 0, NULL, NULL, NULL, NULL); * To build for any particular GPU device or a list of devices : - -:: - - int nDevices = 0; + +:: + + int nDevices = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevices); cl_device_id * devices = malloc(nDevices * sizeof(cl_device_id)); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, nDevices * sizeof(cl_device_id), devices, NULL); * To build for the nth GPU device in a list of devices: -:: - +:: + clBuildProgram(program, 1, &devices[n], NULL, NULL, NULL); * To build for the first n number of GPU devices -:: +:: clBuildProgram(program, n, devices, NULL, NULL, NULL); **Build Options:** -A list of options can be passed during program build to control each stage of the building process. The full list includes various categories of options, such as preprocessor, compiler, optimization, linker, and debugger. Some of them are standard (specified by Khronos); others are vendor-specific. For details about the standard options, see the clBuildProgram API’s description in the OpenCL specification. +A list of options can be passed during program build to control each stage of the building process. The full list includes various categories of options, such as preprocessor, compiler, optimization, linker, and debugger. Some of them are standard (specified by Khronos); others are vendor-specific. For details about the standard options, see the clBuildProgram API's description in the OpenCL specification. -For information about the frequently used standard build options, see “Supported Standard OpenCL Compiler Options”. +For information about the frequently used standard build options, see "Supported Standard OpenCL Compiler Options". -For information about AMD-developed supplemental options and environment variables, see “AMD-Developed Supplemental Compiler Options”. +For information about AMD-developed supplemental options and environment variables, see "AMD-Developed Supplemental Compiler Options". **Special note for building OpenCL 2.0 programs:** @@ -1415,7 +1415,7 @@ OpenCL provides a way to check and query the compilation/linking errors that occ **Example:** -:: +:: cl_int err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (err != CL_SUCCESS) @@ -1440,8 +1440,8 @@ The user must compile each program object separately. This step may be a little **Example (derived from the OpenCL specification):** Consider the following program source: - -:: + +:: #include #include @@ -1453,7 +1453,7 @@ Consider the following program source: This kernel includes two headers, foo.h and mydir/myinc.h. 
So first create the program objects corresponding to each header as follows: -:: +:: cl_program foo_pg = clCreateProgramWithSource(context, 1, &foo_header_src, NULL, &err); @@ -1466,10 +1466,10 @@ Suppose the program source described above is given by program_A and is loaded v Now, these headers can be passed as embedded headers along with the program object -:: +:: - cl_program input_headers[2] = { foo_pg, myinc_pg }; - char * input_header_names[2] = { “foo.h”, “mydir/myinc.h” }; + cl_program input_headers[2] = { foo_pg, myinc_pg }; + char * input_header_names[2] = { "foo.h", "mydir/myinc.h" }; clCompileProgram(program_A, 0, NULL, // num_devices & device_list NULL, // compile_options @@ -1508,8 +1508,8 @@ Supported Standard OpenCL Compiler Options ########################################### The frequently-used build options are: - * -I dir — Add the directory dir to the list of directories to be searched for header files. When parsing #include directives, the OpenCL compiler resolves relative paths using the current working directory of the application. - * -D name — Predefine name as a macro, with definition = 1. For -D name=definition, the contents of definition are tokenized and processed as if they appeared during the translation phase three in a #define directive. In particular, the definition is truncated by embedded newline characters. + * -I dir -- Add the directory dir to the list of directories to be searched for header files. When parsing #include directives, the OpenCL compiler resolves relative paths using the current working directory of the application. + * -D name -- Predefine name as a macro, with definition = 1. For -D name=definition, the contents of definition are tokenized and processed as if they appeared during the translation phase three in a #define directive. In particular, the definition is truncated by embedded newline characters. -D options are processed in the order they are given in the options argument to ``clBuildProgram``. For additional build options, see the :ref:OpenCL specification. @@ -1521,16 +1521,16 @@ AMD-Developed Supplemental Compiler Options The following supported options are not part of the OpenCL specification: - * -g — This is an experimental feature that lets you use the GNU project debugger, GDB, to debug kernels on x86 CPUs running Linux or - cygwin/minGW under Windows. For more details, see Chapter 4, “Debugging and Profiling OpenCL.” This option does not affect the default optimization of the OpenCL code. - * -O0 — Specifies to the compiler not to optimize. This is equivalent to the OpenCL standard option -cl-opt-disable. - * -f[no-]bin-source — Does [not] generate OpenCL source in the .source section. For more information, see Appendix C, “OpenCL BinaryImage Format (BIF) v2.0.” by default, this option does NOT generate the source. - * -f[no-]bin-llvmir — Does [not] generate LLVM IR in the .llvmir section. - For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By default, this option GENERATES the LLVM IR. - * -f[no-]bin-amdil — Does [not] generate AMD IL in the .amdil section. For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By default, this option does NOT generate the AMD IL. - * -f[no-]bin-exe — Does [not] generate the executable (ISA) in the .text section. For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By default, this option GENERATES the ISA. - * -f[no-]bin-hsail — Does [not] generate HSAIL/BRIG in the binary. 
By default, this option does NOT generate HSA IL/BRIG in the binary. - * -save-temps[=] — This option dumps intermediate temporary files, such as IL and ISA code, for each OpenCL kernel. If is not given, temporary files are saved in the default temporary directory (the current directory for Linux, C:\Users \\AppData\Local for Windows). If is given, those temporary files are saved with the given + * -g -- This is an experimental feature that lets you use the GNU project debugger, GDB, to debug kernels on x86 CPUs running Linux or + cygwin/minGW under Windows. For more details, see Chapter 4, "Debugging and Profiling OpenCL." This option does not affect the default optimization of the OpenCL code. + * -O0 -- Specifies to the compiler not to optimize. This is equivalent to the OpenCL standard option -cl-opt-disable. + * -f[no-]bin-source -- Does [not] generate OpenCL source in the .source section. For more information, see Appendix C, "OpenCL BinaryImage Format (BIF) v2.0." by default, this option does NOT generate the source. + * -f[no-]bin-llvmir -- Does [not] generate LLVM IR in the .llvmir section. + For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By default, this option GENERATES the LLVM IR. + * -f[no-]bin-amdil -- Does [not] generate AMD IL in the .amdil section. For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By default, this option does NOT generate the AMD IL. + * -f[no-]bin-exe -- Does [not] generate the executable (ISA) in the .text section. For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By default, this option GENERATES the ISA. + * -f[no-]bin-hsail -- Does [not] generate HSAIL/BRIG in the binary. By default, this option does NOT generate HSA IL/BRIG in the binary. + * -save-temps[=] -- This option dumps intermediate temporary files, such as IL and ISA code, for each OpenCL kernel. If is not given, temporary files are saved in the default temporary directory (the current directory for Linux, C:\Users \\AppData\Local for Windows). If is given, those temporary files are saved with the given . If is an absolute path prefix, such as C:\your\work\dir\mydumpprefix, those temporaries are saved under C:\your\work\dir, with mydumpprefix as prefix to all temporary names. For example, @@ -1540,13 +1540,13 @@ The following supported options are not part of the OpenCL specification: | _temp_nn_xxx_yyy.il, _temp_nn_xxx_yyy.isa | - + | -save-temps=aaa | under the default directory | aaa_nn_xxx_yyy.il, aaa_nn_xxx_yyy.isa | - + | -save-temps=C:\you\dir\bbb | under C:\you\dir | bbb_nn_xxx_yyy.il, bbb_nn_xxx_yyy.isa @@ -1556,8 +1556,8 @@ where xxx and yyy are the device name and kernel name for this build, respective To avoid source changes, there are two environment variables that can be used to change CL options during the runtime. -* AMD_OCL_BUILD_OPTIONS — Overrides the CL options specified in clBuildProgram(). -* AMD_OCL_BUILD_OPTIONS_APPEND — Appends options to those specified in clBuildProgram(). +* AMD_OCL_BUILD_OPTIONS -- Overrides the CL options specified in clBuildProgram(). +* AMD_OCL_BUILD_OPTIONS_APPEND -- Appends options to those specified in clBuildProgram(). .. _Creating-device-specific-binaries: @@ -1567,7 +1567,7 @@ To generate pre-built device-specific binaries from the OpenCL C source or from 1. Create the program object from OpenCL C source using clCreateProgramWithSource(). -2. Build (i.e. 
compile and link) the program object (for details, see the “Generating program executable” section). +2. Build (i.e. compile and link) the program object (for details, see the "Generating program executable" section). 3. Read the device-specific binaries from the program object using clGetProgramInfo() as shown below: @@ -1576,20 +1576,20 @@ To generate pre-built device-specific binaries from the OpenCL C source or from //Get the number of devices attached with program object cl_uint nDevices = 0; clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &nDevices, NULL); - + //Get the Id of all the attached devices cl_device_id *devices = new cl_device_id[nDevices]; clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * nDevices, devices, NULL); - + // Get the sizes of all the binary objects size_t *pgBinarySizes = new size_t[nDevices]; lGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * nDevices, pgBinarySizes, NULL); - + // Allocate storage for each binary objects unsigned char **pgBinaries = new unsigned char*[nDevices]; for (cl_uint i = 0; i < nDevices; i++) { pgBinaries[i] = new unsigned char[pgBinarySizes[i]]; } - + // Get all the binary objects clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) * nDevices, pgBinaries, NULL); @@ -1604,25 +1604,25 @@ The runtime system assigns the work in the command queues to the underlying devi ============================= ====================================================== OpenCL API Function Description ============================= ====================================================== -clCreateCommandQueueWith Create a command queue for a specific device +clCreateCommandQueueWith Create a command queue for a specific device Properties (in OpenCL 2.0) (CPU,GPU.) -clCreateCommandQueue() -(in OpenCL 1.x; deprecated -in OpenCL 2.0) +clCreateCommandQueue() +(in OpenCL 1.x; deprecated +in OpenCL 2.0) -clCreateKernel() Creates a kernel object from the program object. +clCreateKernel() Creates a kernel object from the program object. clCreateBuffer() Creates a buffer object for use via OpenCL kernels. -clSetKernelArg() Set the kernel arguments, and enqueue the kernel in a +clSetKernelArg() Set the kernel arguments, and enqueue the kernel in a clEnqueueNDRangeKernel() command queue. -clEnqueueReadBuffer(), Enqueue a command in a command queue to read from a -clEnqueueWriteBuffer() buffer object to host memory, or write to the buffer +clEnqueueReadBuffer(), Enqueue a command in a command queue to read from a +clEnqueueWriteBuffer() buffer object to host memory, or write to the buffer object from host memory clEnqueueWaitForEvents() Wait for the specified events to complete. -============================= ====================================================== +============================= ====================================================== The commands can be broadly classified into three categories. @@ -1644,7 +1644,7 @@ Running the Program Creating Kernel Objects *********************** -After a program is created and built, the next step is to run the kernel code on the devices. Running the kernel code requires the creation of one or more kernel objects for each kernel function (declared as “ kernel” or “kernel”). Kernel objects are run-time objects that bind the specific kernel function with the argument values to be used while executing it. +After a program is created and built, the next step is to run the kernel code on the devices. 
Running the kernel code requires the creation of one or more kernel objects for each kernel function (declared as " kernel" or "kernel"). Kernel objects are run-time objects that bind the specific kernel function with the argument values to be used while executing it. The clCreateKernel API creates a kernel object from a program object by using the name of the kernel function passed with program object. The arguments to kernel objects are set by the following APIs: @@ -1657,10 +1657,10 @@ SVM pointers as the argument value. A sample kernel definition is shown below. :: - + kernel void sample_kernel( global const uchar *normalPtr, global uchar *svmPtr) - { - … + { + ... } To create a kernel object for the above kernel, you must pass the program object corresponding to the kernel to the clCreateKernel function. Assuming that the program object containing the above kernel function has been created and built as program, a kernel object for the above kernel would be created as follows: @@ -1700,7 +1700,7 @@ A command queue (host or device) is created by using the clCreateCommandQueueWit **Example: To create a default device-side out-of-order command queue with a specific size** :: - + cl_queue_properties prop[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, CL_QUEUE_SIZE, maxQueueSize, 0 }; cl_command_queue commandQueue = clCreateCommandQueueWithProperties(context, deviceId, props, &status); @@ -1709,7 +1709,7 @@ Running a Kernel (from the host) ********************************* After a command queue has been created, the queue can be used to en-queue the commands to the associated device. The clEnqueueNDRangeKernel API en-queues a command to execute a kernel to a device. During the kernel en- queue, one must specify the total number of kernel instances or work-items to be executed by the device and the size of each work-group or block. This information is set by the work_dim, global_work_size, local_work_size and global_work_offset arguments. Like any other command en-queuing API, the clEnqueueNDRangeKernel returns an event object that conveys information about the en-queued kernel and can be used to synchronization other commands dependent on this kernel. In this API, a list of events that need to complete before this particular command can be executed can be specified. -For example, suppose a kernel object and command queue, named “kernel” and “commandQueue” respectively, have already been created. Suppose you want to launch the kernel over a 2-D dimensional space having total work-items +For example, suppose a kernel object and command queue, named "kernel" and "commandQueue" respectively, have already been created. Suppose you want to launch the kernel over a 2-D dimensional space having total work-items {1024x1024} and each block/group size {16x16}. To do this, the kernel can be en-queued into the command queue as follows: | cl_uint workDim = 2; @@ -1740,7 +1740,7 @@ For GPU processing, the OpenCL compiler generates an intermediate representation Profiling OpenCL ============================== -This chapter discusses how to profile OpenCL programs running on AMD GPU and CPU compute devices. The preferred method is to debug with AMD CodeXL, as described in “AMD CodeXL GPU Debugger.” The second method, described in “Debugging CPU Kernels with GDB,” is to use experimental features provided by ROCm (GNU project debugger, GDB) to debug kernels on x86 CPUs running Linux. 
+This chapter discusses how to profile OpenCL programs running on AMD GPU and CPU compute devices. The preferred method is to debug with AMD CodeXL, as described in "AMD CodeXL GPU Debugger." The second method, described in "Debugging CPU Kernels with GDB," is to use experimental features provided by ROCm (GNU project debugger, GDB) to debug kernels on x86 CPUs running Linux. .. _AMD-CodeXL-GPU: @@ -1749,7 +1749,7 @@ Downloading and installing CodeXL and Radeon Compute Profiler Download the latest version of CodeXL from the CodeXL home page: http://developer.amd.com/tools-and-sdks/opencl-zone/codexl/ -Radeon Compute Profiler is a performance analysis tool that gathers data from the API run-time and GPU for OpenCL™ and ROCm/HSA applications +Radeon Compute Profiler is a performance analysis tool that gathers data from the API run-time and GPU for OpenCL(TM) and ROCm/HSA applications RCP is installed when you you use rocm-dev upon instal of the driver. You can access the source code at https://github.com/GPUOpen-Tools/RCP @@ -1759,20 +1759,20 @@ Either install the tar archive, or install the .deb package. **Tar archive:** -1. Download the AMD_CodeXL_Linux*.tar.gz 64-bit Linux tar package at https://github.com/GPUOpen-Tools/CodeXL/releases +1. Download the AMD_CodeXL_Linux*.tar.gz 64-bit Linux tar package at https://github.com/GPUOpen-Tools/CodeXL/releases 2. Run: - $ tar –xvzf CodeXL_Linux*.tar.gz + $ tar -xvzf CodeXL_Linux*.tar.gz **Debian package :** 1. Download the ``amdcodexl-*.deb 64-bit Linux Debian package.`` 2. Run: ``$ sudo dpkg -i amdcodexl_x.x.x-1_amd64.deb `` - + 3. Run: ``$ sudo apt-get -f install`` -Or build the project from source code https://github.com/GPUOpen-Tools/CodeXL +Or build the project from source code https://github.com/GPUOpen-Tools/CodeXL Using CodeXL for profiling ########################### @@ -1784,7 +1784,7 @@ Two modes in CodeXL are particularly useful for profiling: GPU Profile Mode ***************** -The GPU Profile Mode helps developers analyze and profile OpenCL™ host and device code. Developers can profile the entire application or only the kernels by using one of the following modes: +The GPU Profile Mode helps developers analyze and profile OpenCL(TM) host and device code. Developers can profile the entire application or only the kernels by using one of the following modes: * Entire application profile: Collect application trace mode * Kernel profile: Collect GPU performance counter mode @@ -1803,13 +1803,13 @@ While running your application in the GPU Profile mode, CodeXL collects valuable * **Timeline visualization:** Visualize host and device execution in a timeline chart - View number of OpenCL™ contexts and command queues created and the relationships between these items + View number of OpenCL(TM) contexts and command queues created and the relationships between these items - View data transfer operations and kernel executions on the device + View data transfer operations and kernel executions on the device Determine proper synchronization and load balancing - - + + .. image:: images/4.3.png :align: center @@ -1818,30 +1818,30 @@ While running your application in the GPU Profile mode, CodeXL collects valuable Includes a helpful list of best practices Includes recommendations to improve program performance - + * **Summary pages:** Find top bottlenecks I/O bound Compute bound - - + + .. 
image:: images/4.4.png :align: center - * **Kernel occupancy:** Estimate OpenCL™ kernel occupancy for AMD APUs and GPUs + * **Kernel occupancy:** Estimate OpenCL(TM) kernel occupancy for AMD APUs and GPUs Visual indication of the limiting kernel resources for number of wavefronts in flight View the maximum number of wavefronts in flight limited by - –Work group size + -Work group size - –Number of allocated scalar or vector registers + -Number of allocated scalar or vector registers - –Amount of allocated LDS + -Amount of allocated LDS - –View the maximum resource limit for the GPU device + -View the maximum resource limit for the GPU device .. image:: images/4.5.png @@ -1859,9 +1859,9 @@ The Analyze Mode provides a nice way to begin writing your kernel and to compile The Analyze Mode allows a user to do the following: -* **Edit your OpenCL™ kernel inside CodeXL editor** +* **Edit your OpenCL(TM) kernel inside CodeXL editor** Create a new file - Drag and drop an existing OpenCL™ kernel file + Drag and drop an existing OpenCL(TM) kernel file * **Highlight keywords** The CodeXL editor highlights keywords for easier editing @@ -1874,9 +1874,9 @@ The Analyze Mode allows a user to do the following: * Choose your target device The Analyze Mode enables to compile to any supported device target, without the need to install the device -* Fix OpenCL™ compiler errors and warnings in which the kernel file is the only input +* Fix OpenCL(TM) compiler errors and warnings in which the kernel file is the only input View OpenCL compilation errors and fix immediately. -* Edit OpenCL™ Compiler options with an easy options tab +* Edit OpenCL(TM) Compiler options with an easy options tab CodeXL summarizes all the OpenCL options so that it is easy to use them. @@ -1916,15 +1916,15 @@ The following list contains the major static C++ features supported by this exte * Kernel and function overloading. * Inheritance: - | – Strict inheritance. - | – Friend classes. - | – Multiple inheritance. + | - Strict inheritance. + | - Friend classes. + | - Multiple inheritance. * Templates: - | –Kernel templates. - | –Member templates. - | –Template default argument. - | –Limited class templates (the virtual. keyword is not exposed). - | –Partial template specialization + | -Kernel templates. + | -Member templates. + | -Template default argument. + | -Limited class templates (the virtual. keyword is not exposed). + | -Partial template specialization * Namespaces. * References. * this operator. @@ -1946,7 +1946,7 @@ Static C++ features not supported by this extension are: * The language specified in this extension can be easily expanded to support these features. Relations with ISO/IEC C++ -*************************** +*************************** This extension focuses on documenting the differences between the OpenCL Static C++ kernel language and the ISO/IEC Programming languages C++ specification. Where possible, this extension leaves technical definitions to the ISO/IEC specification. @@ -1983,7 +1983,7 @@ To compile a program that contains static C++ kernels and functions, the applica where language is defined as one of the following: - * clc – the source language is considered to be OpenCL C, as defined in the + * clc - the source language is considered to be OpenCL C, as defined in the The OpenCL Programming Language version 1.21. * clc++ - the source language is considered to be OpenCL C++, as defined in the following sections of the this document. 
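For illustration, a host-side call that selects the C++ kernel language at build time might look like the following sketch; it assumes ``program`` and ``device`` have already been created with ``clCreateProgramWithSource()`` and ``clGetDeviceIDs()``, as shown earlier in this chapter::

	// Build the program object as OpenCL Static C++ rather than OpenCL C.
	// "program" and "device" are assumed to exist already (hypothetical names).
	const char * options = "-x clc++";
	cl_int err = clBuildProgram(program, 1, &device, options, NULL, NULL);
	if (err != CL_SUCCESS)
	{
	    // On failure, retrieve the build log with clGetProgramBuildInfo(),
	    // as in the error-checking example shown earlier.
	}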
@@ -2036,7 +2036,7 @@ As per of the static C++ language specification, a number of restrictions limit Also, the rules for well-formed programs as defined by Section 13 of the static C++ language specification are lifted to apply to both kernel and function declarations. -The overloading resolution is per Section 13.1 of the static C++ language specification, but extended to account for vector types. The algorithm for “best viable function”, Section 13.3.3 of the static C++ language specification, is extended for vector types by inducing a partial-ordering as a function of the partial-ordering of its elements. Following the existing rules for vector types in the OpenCL 1.2 specification, explicit conversion between vectors is not allowed. (This reduces the number of possible overloaded functions with respect to vectors, but this is not expected to be a particular burden to developers because explicit conversion can always be applied at the point of function evocation.) +The overloading resolution is per Section 13.1 of the static C++ language specification, but extended to account for vector types. The algorithm for "best viable function", Section 13.3.3 of the static C++ language specification, is extended for vector types by inducing a partial-ordering as a function of the partial-ordering of its elements. Following the existing rules for vector types in the OpenCL 1.2 specification, explicit conversion between vectors is not allowed. (This reduces the number of possible overloaded functions with respect to vectors, but this is not expected to be a particular burden to developers because explicit conversion can always be applied at the point of function evocation.) For overloaded kernels, the following syntax is used as part of the kernel name: @@ -2103,7 +2103,7 @@ Examples Passing a Class from the Host to the Device and Back ****************************************************** -The class definition must be the same on the host code and the device code, besides the members’ type in the case of vectors. If the class includes vector data types, the definition must conform to the table that appears on Section 6.1.2 +The class definition must be the same on the host code and the device code, besides the members' type in the case of vectors. If the class includes vector data types, the definition must conform to the table that appears on Section 6.1.2 of the OpenCL Programming Specification 1.2, Corresponding API type for @@ -2136,10 +2136,10 @@ OpenCL Language types. int x; } - MyFunc () + MyFunc () { tempClass = new(Test); - ... // Some OpenCL startup code – create context, queue, etc. + ... // Some OpenCL startup code - create context, queue, etc. cl_mem classObj = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Test), &tempClass, event); clEnqueueMapBuffer(...,classObj,...); tempClass.setX(10); @@ -2147,17 +2147,17 @@ OpenCL Language types. clEnqueueNDRange(..., fooKernel, ...); clEnqueueMapBuffer(...,classObj,...); //class is passed back to the Host } - + Kernel Overloading ******************* This example shows how to define and use mangled_name for kernel overloading, and how to choose the right kernel from the host code. 
Assume the following kernels are defined: -:: +:: __attribute__((mangled_name(testAddFloat4))) kernel void testAdd(global float4 * src1, global float4 * src2, global float4 * dst) - { + { int tid = get_global_id(0); dst[tid] = src1[tid] + src2[tid]; } @@ -2215,7 +2215,7 @@ OpenCL 2.0 and 2.1 features are provided with the ROCm 2.4 OpenCL Language Runti For guidelines on how to migrate from OpenCL 1.2 to OpenCL 2.1 and for information about querying for image- and device-specific extensions, see Portability considerations. -For a list of the new and deprecated functions, “New and deprecated functions in OpenCL 2.0.” +For a list of the new and deprecated functions, "New and deprecated functions in OpenCL 2.0." .. _Shared-virtual-Memory: @@ -2234,7 +2234,7 @@ Support for SVM does not imply or require that the host and the OpenCL devices i A caveat, however, concerns situations in which the host and the OpenCL devices access the same region of memory at the same time. It would be highly inefficient for the host and the OpenCL devices to have a consistent view of the memory for each load/store from any device/host. In general, the memory model of the language or architecture implementation determines how or when a memory location written by one thread or agent is visible to another. The memory model also determines to what extent the programmer can control the scope of such accesses. -OpenCL 2.0 adopts the memory model defined in C++11 with some extensions. The memory orders taken from C++11 are: "relaxed", "acquire", "release", “acquire-release”, and "sequential consistent". +OpenCL 2.0 adopts the memory model defined in C++11 with some extensions. The memory orders taken from C++11 are: "relaxed", "acquire", "release", "acquire-release", and "sequential consistent". OpenCL 2.0 introduces a new (C++11-based) set of atomic operations with specific memory-model based semantics. Atomic operations are indivisible: a thread or agent cannot see partial results. The atomic operations supported are: @@ -2255,7 +2255,7 @@ OpenCL 2.0 introduces the concept of "memory scope", which limits the extent to OpenCL 2.0 further differentiates between coarse-grained SVM buffer sharing and fine-grained SVM (buffer and system) sharing mechanisms. These mechanisms define the granularity at which the SVM buffers are shared. Updates to coarse-grained or fine-grained SVM are visible to other devices at synchronization points: - + * For coarse-grained SVM, the synchronization points are: the mapping or un- mapping of the SVM memory and kernel launch or completion. This means that any updates are visible only at the end of the kernel or at the point of un-mapping the region of memory. Coarse-grained buffer memory has a fixed virtual address for all the devices it is allocated on. In the AMD implementation, the physical memory is allocated on Device Memory. @@ -2292,7 +2292,7 @@ Some applications do not require fine-grained atomics to ensure that the SVM is For example, while searching in parallel on a binary search tree , coarse-grain buffers are usually sufficient. In general, coarse-grain buffers provide faster access compared to fine grain buffers as the memory is not required to be consistent across devices. 
-:: +:: for (i = 0; i < keys_per_wi; i++) { key = search_keys[init_id + i]; tmp_node = root; @@ -2316,10 +2316,10 @@ The host creates two buffers, svmTreeBuf and svmSearchBuf, to hold the given tre The next task is to create the tree and populate the svmTreeBuf using ``clSVMEnqueueMap`` and ``clSVMEnqueueUnmap``. The host-code method, cpuCreateBinaryTree, illustrates this mechanism; note the calls to these map/unmap APIs. The host then creates the keys to be searched in svmSearchBuf, as the cpuInitSearchKeys method illustrates. Next, it enqueues the kernel to search the binary tree for the given keys in the svmSearchBuf, and it sets the parameters to the kernel using clSetKernelArgSVMPointer: -:: +:: int status = clSetKernelArgSVMPointer(sample_kernel, 0, (void *)(svmTreeBuf)); - + status = clSetKernelArgSVMPointer(sample_kernel, 1, (void *)(svmSearchBuf)); Note that the routine passes both svmTreeBuf and svmSearchBuf to the kernel as parameters. The following node structure demonstrates how to create the tree on the host using pointers to the left and right children: @@ -2363,7 +2363,7 @@ Updates to the tree occur on the host (CPU) or on the GPU, but not on both simul Because the tree is created on the host, and because OpenCL 1.2 disallows SVM, implementing these steps is difficult in OpenCL 1.2. In OpenCL 1.2, you must store the tree as arrays, copy the arrays to the GPU memory (specifying the appropriate offsets), and then copy the arrays back to the host. -The “data” is the tree created by the host as a coarse-grain buffer and is passed to the kernel as an input pointer. +The "data" is the tree created by the host as a coarse-grain buffer and is passed to the kernel as an input pointer. .. image:: images/6.1.png :align: center @@ -2392,15 +2392,15 @@ Generic example **************** In OpenCL 1.2, the developer needed to write three functions for a pointer p that can reference the local, private, or global address space:: - - void fooL (local int *p) { … } - void fooP (private int *p) { … } - void fooG (global int *p) { … } - + + void fooL (local int *p) { ... } + void fooP (private int *p) { ... } + void fooG (global int *p) { ... } + In OpenCL 2.0, the developer needs to write only one function:: - + void foo (int *p) As foo is a generic function, the compiler will accept calls to it with pointers to any address space except the constant address space. @@ -2421,7 +2421,7 @@ OpenCL sample, addMul2d is a generic function that uses generic address spaces f :: float4 addMul2D (uchar4 *src, float *filter, int2 filterDim, int width) - { + { int i, j; float4 sum = (float4)(0); for(i = 0; i < (filterDim.y); i++) @@ -2450,7 +2450,7 @@ OpenCL 2.0 allows kernels to enqueue other kernels. It provides a new construct, kernels. In addition, OpenCL 2.0 deprecates the run-time API call ``clCreateCommandQueue``, in favor of a new call, ``clCreateCommandQueueWithProperties``, that can create device-side command queues. -Because it eliminates the overhead of returning kernel-launch control to the host, device-side enqueue can in many cases improve application performance. Some platforms (such as AMD’s) provide a standard way of enqueuing work to the hardware, which can further improve the performance. Device-side enqueue has been observed to reduce by the overhead of enqueuing by more than 3x in some cases. +Because it eliminates the overhead of returning kernel-launch control to the host, device-side enqueue can in many cases improve application performance. 
Some platforms (such as AMD's) provide a standard way of enqueuing work to the hardware, which can further improve the performance. Device-side enqueue has been observed to reduce by the overhead of enqueuing by more than 3x in some cases. Applications that are inherently recursive or that require additional processing can derive particular benefit. A classic example of the latter case is a tree search that discovers new nodes when traversing from the root to the leaves. @@ -2458,19 +2458,19 @@ Device enqueue is also useful in determining when all the workgroups of the pare Workgroup/subgroup-level functions *********************************** -OpenCL 2.0 introduces new built-in functions that operate at the workgroup or subgroup level. (A workgroup comprises one or more subgroups; the vendor handles the exact subgroup implementation.) For example, on AMD platforms, a subgroup maps to a “wavefront”. (For details, see the AMD OpenCL User Guide.) +OpenCL 2.0 introduces new built-in functions that operate at the workgroup or subgroup level. (A workgroup comprises one or more subgroups; the vendor handles the exact subgroup implementation.) For example, on AMD platforms, a subgroup maps to a "wavefront". (For details, see the AMD OpenCL User Guide.) Basically, a wavefront is an execution unit on the GPU. The OpenCL specification requires that all work items in a workgroup/subgroup executing the kernel handle these new functions; otherwise, their results may be undefined. OpenCL 2.0 defines the following new built-in functions. Note that it also defines similar functions for subgroups under the cl_khr_subgroups extensions in CL_DEVICE_EXTENSIONS. -1. work_group_all and work_group_any: These functions test a given predicate on all work items in the workgroup. The “all” version effectively performs an AND operation on all predicates and returns the result to all work items; similarly, the “any” operation performs an OR operation. Thus, using the “all” function returns true if the predicate is true for all work items; “any” returns true if it is true for at least one work item. +1. work_group_all and work_group_any: These functions test a given predicate on all work items in the workgroup. The "all" version effectively performs an AND operation on all predicates and returns the result to all work items; similarly, the "any" operation performs an OR operation. Thus, using the "all" function returns true if the predicate is true for all work items; "any" returns true if it is true for at least one work item. 2. work_group_broadcast: This function broadcasts a local value from each work item to all the others in the workgroup. 3. work_group_reduce: Given an operation, work_group_reduce performs the reduction operation on all work items and returns the result. The operation can be min, max or add. For example, when called for an array using the add operation, the function returns the sum of the array elements. -4. work_group_inclusive/exclusive_scan: The “scan” operation is a prefix operation, which performs a reduction up to the work-item ID. If it includes the current ID, the function applies an inclusive scan; otherwise, if it covers everything up to but not including the current work item, it applies an exclusive scan. Again, the operation can be min, max or add. +4. work_group_inclusive/exclusive_scan: The "scan" operation is a prefix operation, which performs a reduction up to the work-item ID. 
If it includes the current ID, the function applies an inclusive scan; otherwise, if it covers everything up to but not including the current work item, it applies an exclusive scan. Again, the operation can be min, max or add. OpenCL 2.0 introduces a Khronos sub-group extension. Sub-groups are a logical abstraction of the hardware SIMD execution model akin to wavefronts, warps, or vectors and permit programming closer to the hardware in a vendor-independent manner. This extension includes a set of cross-sub-group built-in functions that match the set of the cross-work-group built-in functions specified above. @@ -2547,7 +2547,7 @@ The kernel is rewritten in OpenCL 2.0 to enqueue itself. (For full details, see Finally, the kernel launches itself again using device enqueue, but with new bounds: -:: +:: void (^binarySearch_device_enqueue_wrapper_blk)(void) = ^{binarySearch_device_enqueue_multiKeys_child(outputArray, @@ -2561,12 +2561,12 @@ Finally, the kernel launches itself again using device enqueue, but with new bou int err_ret = enqueue_kernel(defQ,CLK_ENQUEUE_FLAGS_WAIT_KERNEL,ndrange1,binarySe arch_device_enqueue_wrapper_blk); It also checks for missing keys; absent any such keys, the search stops by forgoing further enqueues:: - - /**** Search continues only if at least one key is found in previous search ****/ + + /**** Search continues only if at least one key is found in previous search ****/ int Flag = atomic_load_explicit(&,memory_order_seq_cst); if(Flag == 0) - return; - + return; + The advantage is that when the input array is large, the OpenCL 2.0 version divides the input array into 1024-sized chunks. The chunk in which the given key falls is found and another kernel is enqueued which further divides it into 1024- sized chunks, and so on. In OpenCL 1.2, as the whole array is taken as the NDRange, a huge number of work groups require processing. @@ -2604,9 +2604,9 @@ Atomic Loads/Stores This sample illustrates atomic loads/stores with the use of memory orders. The first step is to create this memory on the host:: - + buffer = (int * ) clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER, (N+1)*sizeof(int), 4); - + atomicBuffer = (int * ) clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, (N+1)*sizeof(int), 4); @@ -2633,7 +2633,7 @@ The kernel next stores (100+i), where i is the ID of the work-item into atomicBu After the atomic operation, the updates on fine-grain variables (such as buffer) will also be available at the host. The CPU checks for the following to ensure that the results are OK: :: - + for (i=0;i*)&atomicBuffer[i], std::memory_order_acquire) != (100+i)); /* check the results now */ @@ -2728,12 +2728,12 @@ Pipe. The memory allocated in the above function can be passed to kernels as read- only or write-only pipes. The pipe objects can only be passed as kernel arguments or kernel functions and cannot be declared inside a kernel or as program-scoped objects. Also, a set of built-in functions have been added to operate on the pipes. The important ones are: - + read_pipe (pipe p, gentype * ptr: for reading packet from pipe p into ptr. - + write_pipe (pipe p, gentype * ptr: for writing packet pointed to by ptr to pipe p. -To ensure you have enough space in the pipe structure for reading and writing (before you actually do it), you can use built-in functions to “reserve” enough space. For example, you could reserve room by calling reserve_read_pipe or reserve_write_pipe. 
These functions return a reservation ID, which can be used when the actual operations are performed. Similarly, the standard has built-in functions for workgroup level reservations, such as work_group_reserve_read_pipe and work_group_reserve_write_pipe and for the workgroup order (in the program). These workgroup built-in functions operate at the workgroup level. Ordering across workgroups is undefined. Calls to commit_read_pipe and commit_write_pipe, as the names suggest, commit the actual operations (read/write). +To ensure you have enough space in the pipe structure for reading and writing (before you actually do it), you can use built-in functions to "reserve" enough space. For example, you could reserve room by calling reserve_read_pipe or reserve_write_pipe. These functions return a reservation ID, which can be used when the actual operations are performed. Similarly, the standard has built-in functions for workgroup level reservations, such as work_group_reserve_read_pipe and work_group_reserve_write_pipe and for the workgroup order (in the program). These workgroup built-in functions operate at the workgroup level. Ordering across workgroups is undefined. Calls to commit_read_pipe and commit_write_pipe, as the names suggest, commit the actual operations (read/write). Usage ****** @@ -2748,8 +2748,8 @@ The host creates the pipe, which both kernels will use, as follows: &status); This code makes a pipe that the program kernels can access (read/write). The host creates two kernels, producer_kernel and consumer_kernel. The producer kernel first reserves enough space for the write pipe:: - - //reserve space in pipe for writing random numbers. + + //reserve space in pipe for writing random numbers. reserve_id_t rid = work_group_reserve_write_pipe(rng_pipe, szgr); Next, the kernel writes and commits to the pipe by invoking the following functions: @@ -2760,14 +2760,14 @@ Next, the kernel writes and commits to the pipe by invoking the following functi //reserve pipe for reading reserve_id_t rid = work_group_reserve_read_pipe(rng_pipe, szgr); if(is_valid_reserve_id(rid)) { - //read random number from the pipe. read_pipe(rng_pipe,rid,lid, &rn); work_group_commit_read_pipe(rng_pipe, rid); + //read random number from the pipe. read_pipe(rng_pipe,rid,lid, &rn); work_group_commit_read_pipe(rng_pipe, rid); } The consumer_kernel then uses this set of random number and constructs the histogram. The CPU creates the same histogram and verifies whether the histogram created by the kernel is correct. Here, lid is the local id of the work item, obtained by get_local_id(0). The example code demonstrates how you can use a pipe as a convenient data structure that allows two kernels to communicate. -In OpenCL 1.2, this kind of communication typically involves the host – although kernels can communicate without returning control to the host. Pipes, however, ease programming by reducing the amount of code that some applications require. +In OpenCL 1.2, this kind of communication typically involves the host - although kernels can communicate without returning control to the host. Pipes, however, ease programming by reducing the amount of code that some applications require. .. 
_Program-scope-global-Variables: @@ -2815,7 +2815,7 @@ Creating sRGB image objects is similar to creating an image object of existing s :: - cl_image_format imageFormat; + cl_image_format imageFormat; imageFormat.image_channel_data_type = CL_UNORM_INT8; imageFormat.image_channel_order = CL_sRGBA cl_mem imageObj = clCreateImage( @@ -2836,7 +2836,7 @@ The following is a kernel sample that illustrates how to read an sRGB image obje :: - // Read sRGBA image object (input) and convert it to linear RGB + // Read sRGBA image object (input) and convert it to linear RGB values(results) kernel void sample_kernel( read_only image2d_t input, sampler_t imageSampler, global float *xOffsets, global float *yOffsets, global float4 *results ) // input: sRGBA image object @@ -2939,7 +2939,7 @@ The name of extension is standardized and must contain the following elements wi * cl_khr_ - for extensions approved by Khronos Group. For example: ``cl_khr_fp64`` * cl_ext_ - for extensions provided collectively by multiple vendors. For example: ``cl_ext_device_fission`` - * cl__ – for extension provided by a specific vendor. For example: ``cl_amd_media_ops`` + * cl__ - for extension provided by a specific vendor. For example: ``cl_amd_media_ops`` The OpenCL Specification states that all API functions of the extension must have names in the form of clKHR, clEXT, or cl. All enumerated values must be in the form of CL__KHR, CL__EXT, or CL__. @@ -2967,17 +2967,17 @@ There are special directives for the OpenCL compiler to enable or disable availa #pragma OPENCL EXTENSION all: -The is described in Section A.1, “Extension Name -Convention.”. The second form allows to address all extensions at once. The token can be either: +The is described in Section A.1, "Extension Name +Convention.". The second form allows to address all extensions at once. The token can be either: -* **enable** - the extension is enabled if it is supported, or the error is reported if the specified extension is not supported or token “all” is used. +* **enable** - the extension is enabled if it is supported, or the error is reported if the specified extension is not supported or token "all" is used. * **disable** - the OpenCL implementation/compiler behaves as if the specified extension does not exist. * **all** - only core functionality of OpenCL is used and supported, all extensions are ignored. If the specified extension is not supported then a warning is issued by the compiler. The order of directives in #pragma OPENCL EXTENSION is important: a later directive with the same extension name overrides any previous one. The initial state of the compiler is set to ignore all extensions as if it was explicitly set with the following directive:: - + #pragma OPENCL EXTENSION all : disable This means that the extensions must be explicitly enabled to be used in kernel programs. @@ -2998,7 +2998,7 @@ Use the following function to get an extension function pointer. This returns the address of the extension function specified by the FunctionName string. The returned value must be appropriately cast to a function pointer type, specified in the extension spec and header file. -A return value of NULL means that the specified function does not exist in the CL implementation. A non-NULL return value does not guarantee that the extension function actually exists – queries described in sec. 2 or 3 must be done to ensure the extension is supported. +A return value of NULL means that the specified function does not exist in the CL implementation. 
A non-NULL return value does not guarantee that the extension function actually exists - queries described in sec. 2 or 3 must be done to ensure the extension is supported. The ``clGetExtensionFunctionAddress()`` function cannot be used to get core API function addresses. @@ -3007,16 +3007,16 @@ List of Supported Extensions that are Khronos-Approved For a complete list of the supported extensions, see the OpenCL 1.2 and OpenCL 2.0 specification documents. The typical extensions in OpenCL 1.2 are: -* cl_khr_global_int32_base_atomics – basic atomic operations on 32-bit integers in global memory. -* cl_khr_global_int32_extended_atomics – extended atomic operations on 32-bit integers in global memory. -* cl_khr_local_int32_base_atomics – basic atomic operations on 32-bit integers in local memory. -* cl_khr_local_int32_extended_atomics – extended atomic operations on 32-bit integers in local memory. -* cl_khr_int64_base_atomics – basic atomic operations on 64-bit integers in both global and local memory. -* cl_khr_int64_extended_atomics – extended atomic operations on 64-bit integers in both global and local memory. -* cl_khr_3d_image_writes – supports kernel writes to 3D images. -* cl_khr_byte_addressable_store – this eliminates the restriction of not allowing writes to a pointer (or array elements) of types less than 32-bit wide in kernel program. -* cl_khr_gl_sharing – allows association of OpenGL context or share group with CL context for interoperability. -* cl_khr_icd – the OpenCL Installable Client Driver (ICD) that lets developers select from multiple OpenCL runtimes which may be installed on a system. +* cl_khr_global_int32_base_atomics - basic atomic operations on 32-bit integers in global memory. +* cl_khr_global_int32_extended_atomics - extended atomic operations on 32-bit integers in global memory. +* cl_khr_local_int32_base_atomics - basic atomic operations on 32-bit integers in local memory. +* cl_khr_local_int32_extended_atomics - extended atomic operations on 32-bit integers in local memory. +* cl_khr_int64_base_atomics - basic atomic operations on 64-bit integers in both global and local memory. +* cl_khr_int64_extended_atomics - extended atomic operations on 64-bit integers in both global and local memory. +* cl_khr_3d_image_writes - supports kernel writes to 3D images. +* cl_khr_byte_addressable_store - this eliminates the restriction of not allowing writes to a pointer (or array elements) of types less than 32-bit wide in kernel program. +* cl_khr_gl_sharing - allows association of OpenGL context or share group with CL context for interoperability. +* cl_khr_icd - the OpenCL Installable Client Driver (ICD) that lets developers select from multiple OpenCL runtimes which may be installed on a system. * cl_khr_d3d10_sharing - allows association of D3D10 context or share group with CL context for interoperability. * cl_dx9_media_sharing * Cl_khr_fp16 @@ -3048,7 +3048,7 @@ The typical extensions in OpenCL 2.0 are: cl_ext Extensions ********************** -* cl_ext_device_fission - Support for device fission in OpenCL™. For more information about this extension, see: http://www.khronos.org/registry/cl/extensions/ext/cl_ext_device_fission.txt +* cl_ext_device_fission - Support for device fission in OpenCL(TM). For more information about this extension, see: http://www.khronos.org/registry/cl/extensions/ext/cl_ext_device_fission.txt * cl_ext_atomic_counters_32 - Support for 32-bit atomic counters. 
For more information about this extension, see: https://www.khronos.org/registry/cl/extensions/ext/cl_ext_atomic_counters_32.txt @@ -3060,7 +3060,7 @@ This section describes the AMD vendor-specific extensions. cl_amd_fp64 *************** -Before using double data types, double-precision floating point operators, and/or double-precision floating point routines in OpenCL™ C kernels, include the +Before using double data types, double-precision floating point operators, and/or double-precision floating point routines in OpenCL(TM) C kernels, include the #pragma OPENCL EXTENSION cl_amd_fp64 : enable directive. See Table A.1 for a list of supported routines. cl_amd_vec3 @@ -3109,23 +3109,23 @@ cl_amd_compile_options *********************** This extension adds the following options, which are not part of the OpenCL specification. -* -g — This is an experimental feature that lets you use the GNU project debugger, GDB, to debug kernels on x86 CPUs running Linux or cygwin/minGW under Windows. For more details, see Chapter 4, “Debugging and Profiling OpenCL.” This option does not affect the default optimization of the OpenCL code. -* -O0 — Specifies to the compiler not to optimize. This is equivalent to the +* -g -- This is an experimental feature that lets you use the GNU project debugger, GDB, to debug kernels on x86 CPUs running Linux or cygwin/minGW under Windows. For more details, see Chapter 4, "Debugging and Profiling OpenCL." This option does not affect the default optimization of the OpenCL code. +* -O0 -- Specifies to the compiler not to optimize. This is equivalent to the OpenCL standard option -cl-opt-disable. -* -f[no-]bin-source — Does [not] generate OpenCL source in the .source section. For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By default, the source is NOT generated. -* -f[no-]bin-llvmir — Does [not] generate LLVM IR in the .llvmir section. - For more information, see Appendix C, “OpenCL Binary Image Format (BIF) - v2.0.” By default, LLVM IR IS generated. -* -f[no-]bin-amdil — Does [not] generate AMD IL in the .amdil section. For more information, see Appendix C, “OpenCL Binary Image Format (BIF) v2.0.” By Default, AMD IL is NOT generated. -* -f[no-]bin-exe — Does [not] generate the executable (ISA) in .text section. - For more information, see Appendix C, “OpenCL Binary Image Format (BIF) - v2.0.” By default, the executable IS generated. +* -f[no-]bin-source -- Does [not] generate OpenCL source in the .source section. For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By default, the source is NOT generated. +* -f[no-]bin-llvmir -- Does [not] generate LLVM IR in the .llvmir section. + For more information, see Appendix C, "OpenCL Binary Image Format (BIF) + v2.0." By default, LLVM IR IS generated. +* -f[no-]bin-amdil -- Does [not] generate AMD IL in the .amdil section. For more information, see Appendix C, "OpenCL Binary Image Format (BIF) v2.0." By Default, AMD IL is NOT generated. +* -f[no-]bin-exe -- Does [not] generate the executable (ISA) in .text section. + For more information, see Appendix C, "OpenCL Binary Image Format (BIF) + v2.0." By default, the executable IS generated. * -f[no-]bin-hsail Does [not] generate HSAIL/BRIG in the binary. By default, HSA IL/BRIG is NOT generated. To avoid source changes, there are two environment variables that can be used to change CL options during the runtime. -* AMD_OCL_BUILD_OPTIONS — Overrides the CL options specified in clBuildProgram(). 
-* AMD_OCL_BUILD_OPTIONS_APPEND — Appends options to the options specified in clBuildProgram(). +* AMD_OCL_BUILD_OPTIONS -- Overrides the CL options specified in clBuildProgram(). +* AMD_OCL_BUILD_OPTIONS_APPEND -- Appends options to the options specified in clBuildProgram(). cl_amd_offline_devices *********************** @@ -3137,7 +3137,7 @@ This extension provides the ability to register event callbacks for states other cl_amd_popcnt ************** -This extension introduces a “population count” function called popcnt. This extension was taken into core OpenCL 1.2, and the function was renamed popcount. The core 1.2 popcount function (documented in section 6.12.3 of the OpenCL Specification) is identical to the AMD extension popcnt function. +This extension introduces a "population count" function called popcnt. This extension was taken into core OpenCL 1.2, and the function was renamed popcount. The core 1.2 popcount function (documented in section 6.12.3 of the OpenCL Specification) is identical to the AMD extension popcnt function. cl_amd_media_ops ****************** @@ -3148,14 +3148,14 @@ This extension adds the following built-in functions to the OpenCL language. Not | uint amd_pack(float4 src) | Return value | ((((uint)src[0]) & 0xFF) << 0) + ((((uint)src[1]) & 0xFF) << 8) + ((((uint)src[2]) & 0xFF) << 16) + ((((uint)src[3]) & 0xFF) << 24) - + | | Built-in function: amd_unpack0 | floatn amd_unpack0 (uintn src) | Return value for each vector component | (float)(src[i] & 0xFF) | - + | Built-in function: amd_unpack1 | floatn amd_unpack1 (uintn src) | Return value for each vector component @@ -3170,7 +3170,7 @@ This extension adds the following built-in functions to the OpenCL language. Not | floatn amd_unpack3(uintn src) | Return value for each vector component | (float)((src[i] >> 24) & 0xFF) - + | | Built-in function: amd_bitalign | uintn amd_bitalign (uintn src0, uintn src1, uintn src2) @@ -3185,8 +3185,8 @@ This extension adds the following built-in functions to the OpenCL language. Not | Built-in function: amd_lerp | uintn amd_lerp (uintn src0, uintn src1, uintn src2) | Return value for each vector component - | (((((src0[i] >> 0) & 0xFF) + ((src1[i] >> 0) & 0xFF) + ((src2[i] >> 0) & 1)) >> 1) << 0) + (((((src0[i] >> 8) & 0xFF) + ((src1[i] - | >> 8) & 0xFF) + ((src2[i] >> 8) & 1)) >> 1) << 8) + (((((src0[i] >> 16) & 0xFF) + ((src1[i] >> 16) & 0xFF) + ((src2[i] >> 16) & + | (((((src0[i] >> 0) & 0xFF) + ((src1[i] >> 0) & 0xFF) + ((src2[i] >> 0) & 1)) >> 1) << 0) + (((((src0[i] >> 8) & 0xFF) + ((src1[i] + | >> 8) & 0xFF) + ((src2[i] >> 8) & 1)) >> 1) << 8) + (((((src0[i] >> 16) & 0xFF) + ((src1[i] >> 16) & 0xFF) + ((src2[i] >> 16) & |1)) >> 1) << 16) + (((((src0[i] >> 24) & 0xFF) + ((src1[i] >> 24) & 0xFF) + ((src2[i] >> 24) & 1)) >> 1) << 24) ; | | Built-in function: amd_sad @@ -3207,7 +3207,7 @@ This extension adds the following built-in functions to the OpenCL language. Not | abs(((src0[i] >> 16) & 0xFF) - ((src1[i] >> 16) & 0xFF)) + | abs(((src0[i] >> 24) & 0xFF) - ((src1[i] >> 24) & 0xFF)); | - + | Built-in function: amd_sadhi | uintn amd_sadhi (uintn src0, uintn src1, uintn src2) | Return value for each vector component @@ -3220,21 +3220,21 @@ For more information, see: http://www.khronos.org/registry/cl/extensions/amd/cl_ cl_amd_printf **************** -The OpenCL™ Specification 1.1 and 1.2 support the optional AMD extension cl_amd_printf, which provides printf capabilities to OpenCL C programs. 
To use this extension, an application first must include:: - +The OpenCL(TM) Specification 1.1 and 1.2 support the optional AMD extension cl_amd_printf, which provides printf capabilities to OpenCL C programs. To use this extension, an application first must include:: + #pragma OPENCL EXTENSION cl_amd_printf : enable. Built-in function:: - - printf( constant char * restrict format, …); + + printf( constant char * restrict format, ...); This function writes output to the stdout stream associated with the host application. The format string is a character sequence that: -–is null-terminated and composed of zero and more directives, +-is null-terminated and composed of zero and more directives, -–ordinary characters (i.e. not %), which are copied directly to the output stream unchanged, and +-ordinary characters (i.e. not %), which are copied directly to the output stream unchanged, and -–conversion specifications, each of which can result in fetching zero or more arguments, converting them, and then writing the final result to the output stream. +-conversion specifications, each of which can result in fetching zero or more arguments, converting them, and then writing the final result to the output stream. The format string must be resolvable at compile time; thus, it cannot be dynamically created by the executing program. (Note that the use of variadic arguments in the built-in printf does not imply its use in other built- ins; more importantly, it is not valid to use printf in user-defined functions or kernels.) @@ -3243,55 +3243,55 @@ The OpenCL C printf closely matches the definition found as part of the C99 stan * A 32-bit floating point argument is not converted to a 64-bit double, unless the extension cl_khr_fp64 is supported and enabled, as defined in section 9.3 of the OpenCL Specification 1.1. This includes the double variants if cl_khr_fp64 is supported and defined in the corresponding compilation unit. * 64-bit integer types can be printed using %ld / %lx / %lu . * %lld / %llx / %llu are not supported and reserved for 128-bit integer types (long long). -* All OpenCL vector types (section 6.1.2 of the OpenCL Specification 1.1) can be explicitly passed and printed using the modifier vn, where n can be 2, 3, 4, 8, or 16. This modifier appears before the original conversion specifier for the vector’s component type (for example, to print a float4 %v4f). Since vn is a conversion specifier, it is valid to apply optional flags, such as field width and precision, just as it is when printing the component types. Since a vector is an aggregate type, the comma separator is used between the components: 0:1, … , n-2:n-1. +* All OpenCL vector types (section 6.1.2 of the OpenCL Specification 1.1) can be explicitly passed and printed using the modifier vn, where n can be 2, 3, 4, 8, or 16. This modifier appears before the original conversion specifier for the vector's component type (for example, to print a float4 %v4f). Since vn is a conversion specifier, it is valid to apply optional flags, such as field width and precision, just as it is when printing the component types. Since a vector is an aggregate type, the comma separator is used between the components: 0:1, ... , n-2:n-1. cl_amd_predefined_macros ************************* -The following macros are predefined when compiling OpenCL™ C kernels. These macros are defined automatically based on the device for which the code is being compiled. +The following macros are predefined when compiling OpenCL(TM) C kernels. 
These macros are defined automatically based on the device for which the code is being compiled. GPU devices: - | __Barts__ - | __Bheem__ - | __Bonaire__ - | __Caicos__ - | __Capeverde__ - | __Carrizo__ - | __Cayman__ - | __Cedar__ - | __Cypress__ + | __Barts__ + | __Bheem__ + | __Bonaire__ + | __Caicos__ + | __Capeverde__ + | __Carrizo__ + | __Cayman__ + | __Cedar__ + | __Cypress__ | __Devastator__ - | __Hainan__ - | __Iceland__ - | __Juniper__ - | __Kalindi__ - | __Kauai__ - | __Lombok__ - | __Loveland__ - | __Mullins__ - | __Oland__ - | __Pitcairn__ - | __RV710__ - | __RV730__ - | __RV740__ - | __RV770__ - | __RV790__ - | __Redwood__ - | __Scrapper__ - | __Spectre__ - | __Spooky__ - | __Tahiti__ - | __Tonga__ - | __Turks__ + | __Hainan__ + | __Iceland__ + | __Juniper__ + | __Kalindi__ + | __Kauai__ + | __Lombok__ + | __Loveland__ + | __Mullins__ + | __Oland__ + | __Pitcairn__ + | __RV710__ + | __RV730__ + | __RV740__ + | __RV770__ + | __RV790__ + | __Redwood__ + | __Scrapper__ + | __Spectre__ + | __Spooky__ + | __Tahiti__ + | __Tonga__ + | __Turks__ | __WinterPark__ - | __GPU__ + | __GPU__ CPU devices: - | __CPU__ - | __X86__ - | __X86_64__ + | __CPU__ + | __X86__ + | __X86_64__ Note that GPU or CPU are predefined whenever a GPU or CPU device is the compilation target. @@ -3300,11 +3300,11 @@ An example kernel is provided below. :: #pragma OPENCL EXTENSION cl_amd_printf : enable const char* getDeviceName() { - #ifdef Cayman + #ifdef Cayman return "Cayman"; - #elif Barts + #elif Barts return "Barts"; - #elif Cypress + #elif Cypress return "Cypress"; #elif defined( Juniper ) return "Juniper"; @@ -3334,12 +3334,12 @@ An example kernel is provided below. return "UnknownDevice"; kernel void test_pf(global int* a) { - printf("Device Name: %s\n", getDeviceName()); + printf("Device Name: %s\n", getDeviceName()); } cl_amd_bus_addressable_memory ****************************** -This extension defines an API for peer-to-peer transfers between AMD GPUs and other PCIe device, such as third-party SDI I/O devices. Peer-to-peer transfers have extremely low latencies by not having to use the host’s main memory or the CPU (see Figure A.1). This extension allows sharing a memory allocated by the graphics driver to be used by other devices on the PCIe bus (peer-to-peer transfers) by exposing a write-only bus address. It also allows memory allocated on other PCIe devices (non-AMD GPU) to be directly accessed by AMD GPUs. One possible use of this is for a video capture device to directly write into the GPU memory using its DMA.This extension is supported only on AMD FirePro™ professional graphics cards. +This extension defines an API for peer-to-peer transfers between AMD GPUs and other PCIe device, such as third-party SDI I/O devices. Peer-to-peer transfers have extremely low latencies by not having to use the host's main memory or the CPU (see Figure A.1). This extension allows sharing a memory allocated by the graphics driver to be used by other devices on the PCIe bus (peer-to-peer transfers) by exposing a write-only bus address. It also allows memory allocated on other PCIe devices (non-AMD GPU) to be directly accessed by AMD GPUs. One possible use of this is for a video capture device to directly write into the GPU memory using its DMA.This extension is supported only on AMD FirePro(TM) professional graphics cards. .. 
image:: images/a.1.png @@ -3367,11 +3367,11 @@ Extensions Brazos Llano Trinity Cape Verde3 Turks4 Caym cl_khr_byte_addressable_store Yes Yes Yes Yes Yes Yes Yes Yes cl_ext_device_fission onlyCPU only CPU onlyCPU No No No No No cl_amd_device_attribute_query Yes Yes Yes Yes Yes Yes Yes Yes - cl_khr_fp64 onlyCPU only CPU onlyCPU Yes Yes Yes No Yes + cl_khr_fp64 onlyCPU only CPU onlyCPU Yes Yes Yes No Yes cl_amd_fp64 onlyCPU only CPU onlyCPU Yes Yes Yes No Yes cl_amd_vec3 Yes Yes Yes Yes Yes Yes Yes Yes cl_khr_d3d10_sharing Yes Yes Yes Yes Yes Yes Yes Yes - cl_amd_media_ops Yes Yes Yes Yes Yes Yes Yes Yes + cl_amd_media_ops Yes Yes Yes Yes Yes Yes Yes Yes cl_amd_printf Yes Yes Yes Yes Yes Yes Yes Yes cl_amd_popcnt Yes Yes Yes Yes Yes Yes Yes Yes cl_khr_3d_image_writes Yes Yes Yes Yes Yes Yes Yes Yes @@ -3380,13 +3380,13 @@ Extensions Brazos Llano Trinity Cape Verde3 Turks4 Caym **Table A.1 Extension Support for AMD GPU Devices 1** -1. AMD Radeon™ HD 79XX series. -2. AMD Radeon™ HD 78XX series. -3. AMD Radeon™ HD 77XX series. -4. AMD Radeon™ HD 75XX series and AMD Radeon™ HD 76XX series. -5. AMD Radeon™ HD 69XX series. -6. AMD Radeon™ HD 68XX series. -7. ATI Radeon™ HD 59XX series and 58XX series, AMD FirePro™ V88XX series and V87XX series. +1. AMD Radeon(TM) HD 79XX series. +2. AMD Radeon(TM) HD 78XX series. +3. AMD Radeon(TM) HD 77XX series. +4. AMD Radeon(TM) HD 75XX series and AMD Radeon(TM) HD 76XX series. +5. AMD Radeon(TM) HD 69XX series. +6. AMD Radeon(TM) HD 68XX series. +7. ATI Radeon(TM) HD 59XX series and 58XX series, AMD FirePro(TM) V88XX series and V87XX series. Note that an atomic counter is a device-level counter that can be added / decremented by different work-items, where the atomicity of the operation is guaranteed. The access to the counter is done only through add/dec built-in functions; thus, no two work-items have the same value returned in the case that a given kernel only increments or decrements the counter. (Also see http://www.khronos.org/registry/cl/extensions/ext/cl_ext_atomic_counters_32.txt.) @@ -3435,13 +3435,13 @@ Note that an atomic counter is a device-level counter that can be added / decrem +---------------------------------+------------+-----------+----------+-----------------------------+ | cl_amd_offline_devices | Yes |Yes | Yes | No | +---------------------------------+------------+-----------+----------+-----------------------------+ - + **Table A.2 Extension Support for Older AMD GPUs and CPUs** -1. ATI Radeon™ HD 5700 series, AMD Mobility Radeon™ HD 5800 series, AMD FirePro™ V5800 series, AMD Mobility FirePro™ M7820. -2. ATI Radeon™ HD 5600 Series, ATI Radeon™ HD 5600 Series, ATI Radeon™ HD 5500 Series, AMD Mobility Radeon™ HD 5700 Series, AMD Mobility Radeon™ HD 5600 Series, AMD FirePro™ V4800 Series, AMD FirePro™ V3800 Series, AMD Mobility FirePro™ M5800 -3. ATI Radeon™ HD 5400 Series, AMD Mobility Radeon™ HD 5400 Series +1. ATI Radeon(TM) HD 5700 series, AMD Mobility Radeon(TM) HD 5800 series, AMD FirePro(TM) V5800 series, AMD Mobility FirePro(TM) M7820. +2. ATI Radeon(TM) HD 5600 Series, ATI Radeon(TM) HD 5600 Series, ATI Radeon(TM) HD 5500 Series, AMD Mobility Radeon(TM) HD 5700 Series, AMD Mobility Radeon(TM) HD 5600 Series, AMD FirePro(TM) V4800 Series, AMD FirePro(TM) V3800 Series, AMD Mobility FirePro(TM) M5800 +3. ATI Radeon(TM) HD 5400 Series, AMD Mobility Radeon(TM) HD 5400 Series 4. Available on all devices that have double-precision, including all Southern Island devices. 5. Environment variable CPU_IMAGE_SUPPORT must be set. 
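To make the atomic-counter note above concrete, here is a minimal kernel sketch using the ``cl_ext_atomic_counters_32`` extension (the kernel and buffer names are hypothetical; see the extension specification linked above for the authoritative interface). The counter is touched only through the extension's increment built-in, so every work-item that keeps an element receives a unique output slot.

::

    #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable

    /* Hypothetical stream-compaction kernel: non-zero elements are packed
       into 'out', with the device-level counter handing out unique slots. */
    kernel void compact_nonzero(global const int *in,
                                global int *out,
                                counter32_t next_slot)
    {
        int v = in[get_global_id(0)];
        if (v != 0)
            out[atomic_inc(next_slot)] = v;   /* returns the pre-increment value */
    }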
@@ -3468,7 +3468,7 @@ Using ICD Sample code that is part of the SDK contains examples showing how to query the platform API and call the functions that require a valid platform parameter. This is a pre-ICD code snippet. :: - + context = clCreateContextFromType(0, dType, NULL, @@ -3479,12 +3479,12 @@ This is a pre-ICD code snippet. :: The ICD-compliant version of this code follows. :: - + /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ - + cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); @@ -3523,27 +3523,27 @@ The ICD-compliant version of this code follows. get whatever the * implementation thinks we should be using. */ - + cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; /* Use NULL for backward compatibility */ cl_context_properties* cprops = (NULL == platform) ? NULL : cps; - + context = clCreateContextFromType(cprops, dType, NULL, NULL, &status); Another example of a pre-ICD code snippet follows. :: - + status = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &numDevices); - + The ICD-compliant version of the code snippet is:: - + status= clGetDeviceiDs(platform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &nurnDevices); - + .. Note:::: It is recommended that the host code look at the platform vendor string when searching for the desired OpenCL platform, instead of using the platform name string. The platform name string might change, whereas the platform vendor string remains constant for a particular vendor's implementation. .. _BIF: @@ -3565,7 +3565,7 @@ The BIF can have other special sections for debugging, etc. It also contains sev * .rodata for storing the OpenCL runtime control data. * other ELF special sections required for forming an ELF (for example: ``.strtab, .symtab, .shstrtab`` ). -By default, OpenCL generates a binary that has LLVM IR, and the executable for the GPU (,.llvmir, .amdil, and .text sections), as well as LLVM IR and the executable for the CPU (.llvmir and .text sections). The BIF binary always contains a .comment section, which is a readable C string. The default behavior can be changed with the BIF options described in Section C.2, “BIF Options,” page C-3. +By default, OpenCL generates a binary that has LLVM IR, and the executable for the GPU (,.llvmir, .amdil, and .text sections), as well as LLVM IR and the executable for the CPU (.llvmir and .text sections). The BIF binary always contains a .comment section, which is a readable C string. The default behavior can be changed with the BIF options described in Section C.2, "BIF Options," page C-3. The LLVM IR enables recompilation from LLVM IR to the target. When a binary is used to run on a device for which the original program was not generated and the original device is feature-compatible with the current device, OpenCL recompiles the LLVM IR to generate a new code for the device. Note that the LLVM IR is only universal within devices that are feature-compatible in the same device type, not across different device types. This means that the LLVM IR for the CPU is not compatible with the LLVM IR for the GPU. The LLVM IR for a GPU works only for GPU devices that have equivalent feature sets. @@ -3646,11 +3646,11 @@ BIF Options ************* OpenCL provides the following options to control what is contained in the binary. --f[no-]bin-source — [not] generate OpenCL source in .source section. 
+-f[no-]bin-source -- [not] generate OpenCL source in .source section. --f[no-]bin-llvmir — [not] generate LLVM IR in .llvmir section. +-f[no-]bin-llvmir -- [not] generate LLVM IR in .llvmir section. --f[no-]bin-exe — [not] generate the executable (ISA) in .text section. The option syntax follows the GCC option syntax. +-f[no-]bin-exe -- [not] generate the executable (ISA) in .text section. The option syntax follows the GCC option syntax. By default, OpenCL generates the .llvmir section, .amdil section, and .text section. The following are examples for using these options: Example 1: Generate executable for execution: @@ -3700,7 +3700,7 @@ A processing element is arranged as a five-way or four-way (depending on the GPU type) very long instruction word (VLIW) processor (see bottom of Figure D.2). Up to five scalar operations (or four, depending on the GPU type) can be co-issued in a VLIW instruction, each of which are executed on one of the corresponding five ALUs. ALUs can execute single-precision floating point or integer operations. One of the five ALUs also can perform transcendental operations (sine, cosine, logarithm, etc.). Double-precision floating point operations are processed (where supported) by connecting two or four of the ALUs (excluding the transcendental core) to perform a single double-precision operation. The processing element also contains one branch execution unit to handle branch instructions. -Different GPU compute devices have different numbers of processing elements. For example, the ATI Radeon™ HD 5870 GPU has 20 compute units, each with +Different GPU compute devices have different numbers of processing elements. For example, the ATI Radeon(TM) HD 5870 GPU has 20 compute units, each with 16 processing elements, and each processing elements contains five ALUs; this yields 1600 physical ALUs. @@ -4004,7 +4004,7 @@ The following code segment shows how to create an OpenCL-OpenGL interoperability glXDestroyContext(glXGetCurrentDisplay(), gGlCtx); continue; } - else + else { //Interoperable device found std::cout<<"Interoperable device found "<`_ + * `Anaconda(R) with Numba acceleration `_ When to Use Anaconda ********************* -Use Anaconda when you’re handling large-scale data-analytics, +Use Anaconda when you're handling large-scale data-analytics, scientific and engineering problems that require you to manipulate large data arrays. @@ -193,7 +193,7 @@ HC Programming Guide **What is the Heterogeneous Compute (HC) API ?** -It’s a C++ dialect with extensions to launch kernels and manage accelerator memory. It closely tracks the evolution of C++ and will incorporate parallelism and concurrency features as the C++ standard does. For example, HC includes early support for the C++17 Parallel STL. At the recent ISO C++ meetings in Kona and Jacksonville, the committee was excited about enabling the language to express all forms of parallelism, including multicore CPU, SIMD and GPU. We’ll be following these developments closely, and you’ll see HC move quickly to include standard C++ capabilities. +It's a C++ dialect with extensions to launch kernels and manage accelerator memory. It closely tracks the evolution of C++ and will incorporate parallelism and concurrency features as the C++ standard does. For example, HC includes early support for the C++17 Parallel STL. At the recent ISO C++ meetings in Kona and Jacksonville, the committee was excited about enabling the language to express all forms of parallelism, including multicore CPU, SIMD and GPU. 
We'll be following these developments closely, and you'll see HC move quickly to include standard C++ capabilities. The Heterogeneous Compute Compiler (HCC) provides two important benefits: @@ -209,7 +209,7 @@ Ease of development Full control over the machine - * Access AMD scratchpad memories (“LDS”) + * Access AMD scratchpad memories ("LDS") * Fully control data movement, prefetch and discard * Fully control asynchronous kernel launch and completion * Get device-side dependency resolution for kernel and data commands (without host involvement) @@ -360,7 +360,7 @@ HIP provides a C++ syntax that is suitable for compiling most code that commonly * Math functions resembling those in the "math.h" header included with standard C++ compilers * Built-in functions for accessing specific GPU hardware capabilities -This section describes the built-in variables and functions accessible from the HIP kernel. It’s intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different. +This section describes the built-in variables and functions accessible from the HIP kernel. It's intended for readers who are familiar with Cuda kernel syntax and want to understand how HIP is different. * :ref:`HIP-GUIDE` @@ -408,7 +408,7 @@ hipLaunchKernelGGL(vector_square, /* compute kernel*/ dim3(blocks), dim3(threadsPerBlock), 0/*dynamic shared*/, 0/*stream*/, /* launch config*/ C_d, A_d, N); /* arguments to the compute kernel */ -hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); +hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost); The HIP kernel language defines builtins for determining grid and block coordinates, math functions, short vectors, atomics, and timer functions. It also specifies additional defines and keywords for function types, address spaces, and optimization controls. (See the :ref:`Kernel_language` for a full description). Here's an example of defining a simple 'vector_square' kernel. @@ -485,6 +485,6 @@ OpenCL Best Practices * :ref:`Optimization-Opencl` - - + + diff --git a/Programming_Guides/hcc-guide.rst b/Programming_Guides/hcc-guide.rst index da7d1343..b0a87f8c 100644 --- a/Programming_Guides/hcc-guide.rst +++ b/Programming_Guides/hcc-guide.rst @@ -22,7 +22,7 @@ The Heterogeneous Compute Compiler (HCC) provides two important benefits: **Full control over the machine** - * Access AMD scratchpad memories (“LDS”) + * Access AMD scratchpad memories ("LDS") * Fully control data movement, prefetch and discard * Fully control asynchronous kernel launch and completion * Get device-side dependency resolution for kernel and data commands (without host involvement) diff --git a/Programming_Guides/hcc-profile.rst b/Programming_Guides/hcc-profile.rst index 7e195fc0..80ec2ef0 100644 --- a/Programming_Guides/hcc-profile.rst +++ b/Programming_Guides/hcc-profile.rst @@ -27,7 +27,7 @@ Kernel Commands ++++++++++++++++ This shows the simplest trace output for kernel commands with no additional verbosity flags:: - + $ HCC_PROFILE=2 ./my-hcc-app ... profile: kernel; Im2Col; 17.8 us; profile: kernel; tg_betac_alphaab; 32.6 us; @@ -36,7 +36,7 @@ This shows the simplest trace output for kernel commands with no additional verb :: PROFILE: TYPE; KERNEL_NAME ; DURATION; - + This example shows profiled kernel commands with full verbose output:: $ HCC_PROFILE=2 HCC_PROFILE_VERBOSE=0xf ./my-hcc-app ... @@ -77,7 +77,7 @@ This example shows memory copy commands with full verbose output: * Sync or Async. Synchronous copies indicate the host waits for the completion for the copy. 
Asynchronous copies are launched by the host without waiting for the copy to complete. * Fast or Slow. Fast copies use the GPUs optimized copy routines from the hsa_amd_memory_copy routine. Slow copies typically involve unpinned host memory and can't take the fast path. * For example `HostToDevice_async_fast. - + * DURATION: command duration measured in us. This is measured using the GPU timestamps and represents the command execution on the acclerator device. * START: command start time in ns. (if HCC_PROFILE_VERBOSE & 0x2) * STOP: command stop time in ns. (if HCC_PROFILE_VERBOSE & 0x2) @@ -94,7 +94,7 @@ Barrier commands are only enabled if HCC_PROFILE_VERBOSE 0x An example barrier command with full vebosity:: profile: barrier; deps:0_acq:none_rel:sys; 5.3 us; 94858731419410; 94858731424690; #0.0.2; - PROFILE: TYPE; BARRIER_NAME ; DURATION; START ; STOP ; ID ; + PROFILE: TYPE; BARRIER_NAME ; DURATION; START ; STOP ; ID ; * PROFILE: always "profile:" to distinguish it from other output. * TYPE: the command type: either kernel, copy, copyslo, or barrier. The examples and descriptions in this section are all copy commands. Copy indicates that the runtime used a call to the fast hsa memory copy routine while copyslo indicates that the copy was implemented with staging buffers or another less optimal path. copy computes the commands using device-side timestamps while copyslo computes the bandwidth based on host timestamps. diff --git a/Programming_Guides/hip-programming-guide.rst b/Programming_Guides/hip-programming-guide.rst index 5114b8ca..5762cc35 100644 --- a/Programming_Guides/hip-programming-guide.rst +++ b/Programming_Guides/hip-programming-guide.rst @@ -56,7 +56,7 @@ hipEventSynchronize Developers can control the release scope for hipEvents: -* By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. +* By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. A stronger system-level fence can be specified when the event is created with hipEventCreateWithFlags: @@ -103,4 +103,4 @@ By default staging buffers are used for unpinned memory transfers. Environment v * HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE - Threshold in bytes for H2D copy. For sizes smaller than threshold staging buffers logic would be used else PinInPlace logic. * HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING - Threshold in bytes for H2D copy. For sizes smaller than threshold direct copy logic would be used else staging buffers logic. * HIP_D2H_MEM_TRANSFER_THRESHOLD - Threshold in bytes for D2H copy. For sizes smaller than threshold staging buffer logic would be used else PinInPlace logic. - + diff --git a/Programming_Guides/hip-programming.rst b/Programming_Guides/hip-programming.rst index 487670d1..3f05b811 100644 --- a/Programming_Guides/hip-programming.rst +++ b/Programming_Guides/hip-programming.rst @@ -1,6 +1,6 @@ .. _hip-pro: - + ##################### HIP Programming Guide ##################### @@ -10,7 +10,7 @@ Host Memory Introduction ------------- - + hipHostMalloc allocates pinned host memory which is mapped into the address space of all GPUs in the system. 
There are two use cases for this host memory: * Faster HostToDevice and DeviceToHost Data Transfers: The runtime tracks the hipHostMalloc allocations and can avoid some of the setup required for regular unpinned memory. For exact measurements on a specific system, experiment with --unpinned and --pinned switches for the hipBusBandwidth tool. @@ -55,7 +55,7 @@ hipEventSynchronize Developers can control the release scope for hipEvents: - * By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. + * By default, the GPU performs a device-scope acquire and release operation with each recorded event. This will make host and device memory visible to other commands executing on the same device. A stronger system-level fence can be specified when the event is created with hipEventCreateWithFlags: diff --git a/Programming_Guides/hip_install.rst b/Programming_Guides/hip_install.rst index 05ddcb54..350dae9d 100644 --- a/Programming_Guides/hip_install.rst +++ b/Programming_Guides/hip_install.rst @@ -96,7 +96,7 @@ By default, HIP uses HCC to compile programs. To use HIP-Clang, add -DHIP_COMPIL cd HIP mkdir build cd build - cmake .. + cmake .. make make install @@ -111,7 +111,7 @@ Here's a richer command-line that overrides the default paths: cd HIP mkdir build - cd build + cd build cmake -DHSA_PATH=/path/to/hsa -DHCC_HOME=/path/to/hcc -DCMAKE_INSTALL_PREFIX=/where/to/install/hip -DCMAKE_BUILD_TYPE=Release .. make make install diff --git a/Programming_Guides/hip_port.rst b/Programming_Guides/hip_port.rst index 59d5c498..75f8a786 100644 --- a/Programming_Guides/hip_port.rst +++ b/Programming_Guides/hip_port.rst @@ -44,7 +44,7 @@ Like the CUDA Driver API, the Module API provides additional control over how co ============ ================================= ================== ================= =========== Format APIs NVCC HCC HIP-CLANG -============ ================================= ================== ================= =========== +============ ================================= ================== ================= =========== Code Object hipModuleLoad, hipModuleLoadData .cubin or PTX text .hsaco .hsaco Fat Binary hipModuleLoadFatBin .fatbin Under Development .hip_fatbin ============ ================================= ================== ================= =========== @@ -115,10 +115,10 @@ CUDA applications may want to mix CUDA driver code with HIP code (see example be ============== =============== =================== HIP Type CU Driver Type CUDA Runtime Type ============== =============== =================== -hipModule_t CUmodule -hipFunction_t CUfunction -hipCtx_t CUcontext -hipDevice_t CUdevice +hipModule_t CUmodule +hipFunction_t CUfunction +hipCtx_t CUcontext +hipDevice_t CUdevice hipStream_t CUstream cudaStream_t hipEvent_t CUevent cudaEvent_t hipArray CUarray cudaArray @@ -227,7 +227,7 @@ The below sample shows how to use hipModuleGetFunction. std::vectorargBuffer(2); memcpy(&argBuffer[0], &Ad, sizeof(void*)); memcpy(&argBuffer[1], &Bd, sizeof(void*)); - + size_t size = argBuffer.size()*sizeof(void*); void *config[] = { @@ -274,7 +274,7 @@ HIP supports texture driver APIs however texture reference should be declared in texture tex; - void myFunc () + void myFunc () { // ... 
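To make the event-scope discussion in the HIP programming guide hunks above concrete, the sketch below creates an event that requests a system-scope release when it is recorded (a minimal illustration that assumes the ROCm-specific ``hipEventReleaseToSystem`` flag; error checking is omitted).

::

    #include <hip/hip_runtime.h>

    void publish_results(float *dst, const float *src, size_t nbytes, hipStream_t stream)
    {
        hipEvent_t done;
        // Request a system-scope release (instead of the default device scope)
        // so the preceding writes become visible to the host and peer devices.
        hipEventCreateWithFlags(&done, hipEventReleaseToSystem);

        hipMemcpyAsync(dst, src, nbytes, hipMemcpyDeviceToHost, stream);
        hipEventRecord(done, stream);
        hipEventSynchronize(done);   // host waits; the copy result is now visible system-wide

        hipEventDestroy(done);
    }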
diff --git a/Programming_Guides/hip_profiling.rst b/Programming_Guides/hip_profiling.rst index 8c167d3e..9cea76ec 100644 --- a/Programming_Guides/hip_profiling.rst +++ b/Programming_Guides/hip_profiling.rst @@ -1,5 +1,5 @@ -.. _hip_profiling: +.. _hip_profiling: ################### Profiling HIP Code @@ -25,12 +25,12 @@ Profiling information can viewed in the CodeXL visualization tool or printed dir * :ref:`How to enable profiling at HIP build time` * :ref:`Tracing and Debug` - + * :ref:`Tracing HIP APIs` - * :ref:`Color` - - - + * :ref:`Color` + + + .. _CodeXL Profiling: CodeXL Profiling @@ -57,7 +57,7 @@ Using rocm-profiler performance counter collection rocm-profiler can record performance counter information to provide greater insight inside a kernel, such as the memory bandwidth, ALU busy percentage, and cache statistics. Collecting the common set of useful counters requires passing the counter configuration files for two passes: :: - + $ /opt/rocm/bin/rocm-profiler -C -O --counterfile /opt/rocm/profiler/counterfiles/counters_HSA_Fiji_pass1 --counterfile /opt/rocm/profiler/counterfiles/counters_HSA_Fiji_pass2 .. _Using CodeXL to view profiling results: @@ -109,7 +109,7 @@ HIP can generate markers at function beginning and end which are displayed on th # Use profile to generate timeline view: export HIP_PROFILE_API=1 $ /opt/rocm/bin/rocm-profiler -A -T - + Or $ /opt/rocm/bin/rocm-profiler -e HIP_PROFILE_API=1 -A -T @@ -130,32 +130,32 @@ Markers can be used to define application-specific events that will be recorded Markers have a specific begin and end time, and can be nested. Nested calls are displayed hierarchically in the CodeXL GUI, with each level of the hierarchy occupying a different row. The HIP APis are defined in "hip_profile.h":: - - #include - + + #include + HIP_BEGIN_MARKER(const char *markerName, const char *groupName); - HIP_END_MARKER(); - + HIP_END_MARKER(); + HIP_BEGIN_MARKER("Setup", "MyAppGroup"); // ... // application code for setup // ... HIP_END_MARKER(); - + For C++ codes, HIP also provides a scoped marker which records the start time when constructed and the end time when the scoped marker is destructed at the end of the scope. This provides a convenient, single-line mechanism to record an event that neatly corresponds to a region of code. :: - void FunctionFoo(...) + void FunctionFoo(...) { - HIP_SCOPED_MARKER("FunctionFoo", "MyAppGroup"); // Marker starts recording here. - + HIP_SCOPED_MARKER("FunctionFoo", "MyAppGroup"); // Marker starts recording here. + // ... // Function implementation - // ... - + // ... + // Marker destroyed here and records end time stamp. }; - + The HIP marker API is only supported on ROCm platform. The marker macros are defined on CUDA platforms and will compile, but are silently ignored at runtime. This `HIP sample `_ shows the profiler marker API used in a small application. @@ -177,21 +177,21 @@ Demangling C++ Kernel Names HIP includes the ``hipdemangleatp`` tool which can post-process an ATP file to "demangle" C++ names. Mangled kernel names encode the C++ arguments and other information, and are guaranteed to be unique even for cases such as operator overloading. However, the mangled names can be quite verbose. 
For example: :: - + ZZ39gemm_NoTransA_MICRO_NBK_M_N_K_TS16XMTS4RN2hc16accelerator_viewEPKflS3_lPfliiiiiiffEN3_EC__719__cxxamp_trampolineElililiiiiiiS3_iS3_S4_ff **hipdemangleatp** will convert this into the more readable:: - + gemm_NoTransA_MICRO_NBK_M_N_K_TS16XMTS4 The hipdemangleatp tool operates on the ATP file "in-place" and thus replaces the input file with the demangled version. :: - + $ hipdemangleatp myfile.atp The kernel name is also shown in some of the summary htlm files (Top10 kernels). These can be regenerated from the demangled ATP file by re-running rocm-profiler: :: - + $ rocm-profiler -T --atpfile myfile.atp A future version of CodeXL may directly integrate demangle functionality. @@ -230,7 +230,7 @@ Reducing timeline trace output file size If the application is already recording the HIP APIs, the HSA APIs are somewhat redundant and the ATP file size can be substantially reduced by not recording these APIs. HIP includes a text file that lists all of the HSA APIs and can assist in this filtering: :: - $ rocm-profiler -F hip/bin/hsa-api-filter-cxl.txt + $ rocm-profiler -F hip/bin/hsa-api-filter-cxl.txt This file can be copied and edited to provide more selective HSA event recording. @@ -246,8 +246,8 @@ Recent pre-built packages of HIP are always built with profiling support enabled $ mkdir build && cd build $ cmake .. -DCOMPILE_HIP_ATP_MARKER $ make install - - + + 2. Install ROCm-Profiler Installing HIP from the `rocm `_ pre-built packages, installs the ROCm-Profiler as well. Alternatively, you can build ROCm-Profiler using the instructions here. 3. Recompile the target application @@ -277,7 +277,7 @@ The HIP runtime can print the HIP function strings to stderr using HIP_TRACE_API Heres a specific example showing the output of the square program running on HIP:: - $ HIP_TRACE_API=1 ./square.hip.out + $ HIP_TRACE_API=1 ./square.hip.out hip-api tid:1:HIP initialized short_tid#1 (maps to full_tid: 0x7f6183b097c0) <> @@ -300,7 +300,7 @@ Heres a specific example showing the output of the square program running on HIP PASSED! HIP_TRACE_API supports multiple levels of debug information: - + * 0x1 = print all HIP APIs. This is the most verbose setting; the flags below allow selecting a subset. * 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernelGGL, hipLaunchModuleKernel * 0x4 = print HIP APIs which initiate GPU memory commands. Includes hipMemcpy*, hipMemset*. diff --git a/Programming_Guides/hipporting-driver-api.rst b/Programming_Guides/hipporting-driver-api.rst index 9de8c622..42892fa7 100644 --- a/Programming_Guides/hipporting-driver-api.rst +++ b/Programming_Guides/hipporting-driver-api.rst @@ -11,7 +11,7 @@ CUDA provides a separate CUDA Driver and Runtime APIs. The two APIs have signifi * Both APIs support events, streams, memory management, memory copy, and error handling. * Both APIs deliver similar performance. * Driver APIs calls begin with the prefix **cu** while Runtime APIs begin with the prefix cuda. For example, the Driver API API contains 'cuEventCreate' while the Runtime API contains 'cudaEventCreate', with similar functionality. -* The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. For example, Driver API defines ``CUDA_ERROR_INVALID_VALUE`` while the Runtime API defines ``cudaErrorInvalidValue`` +* The Driver API defines a different but largely overlapping error code space than the Runtime API, and uses a different coding convention. 
For example, Driver API defines ``CUDA_ERROR_INVALID_VALUE`` while the Runtime API defines ``cudaErrorInvalidValue`` The Driver API offers two additional pieces of functionality not provided by the Runtime API: cuModule and cuCtx APIs. @@ -142,78 +142,78 @@ The ``hipModule_t`` interface does not support ``cuModuleLoadDataEx`` function, For example (CUDA):: CUmodule module; - void *imagePtr = ...; // Somehow populate data pointer with code object - + void *imagePtr = ...; // Somehow populate data pointer with code object + const int numOptions = 1; CUJit_option options[numOptions]; - void * optionValues[numOptions]; - + void * optionValues[numOptions]; + options[0] = CU_JIT_MAX_REGISTERS; unsigned maxRegs = 15; - optionValues[0] = (void*)(&maxRegs); - - cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); - + optionValues[0] = (void*)(&maxRegs); + + cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); + CUfunction k; cuModuleGetFunction(&k, module, "myKernel"); - -HIP:: + +HIP:: hipModule_t module; - void *imagePtr = ...; // Somehow populate data pointer with code object - + void *imagePtr = ...; // Somehow populate data pointer with code object + const int numOptions = 1; hipJitOption options[numOptions]; - void * optionValues[numOptions]; - + void * optionValues[numOptions]; + options[0] = hipJitOptionMaxRegisters; unsigned maxRegs = 15; - optionValues[0] = (void*)(&maxRegs); - + optionValues[0] = (void*)(&maxRegs); + // hipModuleLoadData(module, imagePtr) will be called on HCC path, JIT options will not be used, and // cupModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues) will be called on NVCC path hipModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); - + hipFunction_t k; hipModuleGetFunction(&k, module, "myKernel"); - -The below sample shows how to use hipModuleGetFunction. + +The below sample shows how to use hipModuleGetFunction. :: - + #include #include #include #include #include - + #define LEN 64 - #define SIZE LEN<<2 - + #define SIZE LEN<<2 + #ifdef __HIP_PLATFORM_HCC__ #define fileName "vcpy_isa.co" #endif - + #ifdef __HIP_PLATFORM_NVCC__ #define fileName "vcpy_isa.ptx" - #endif - + #endif + #define kernel_name "hello_world" - + int main(){ float *A, *B; hipDeviceptr_t Ad, Bd; A = new float[LEN]; B = new float[LEN]; - + for(uint32_t i=0;iargBuffer(2); memcpy(&argBuffer[0], &Ad, sizeof(void*)); memcpy(&argBuffer[1], &Bd, sizeof(void*)); - + size_t size = argBuffer.size()*sizeof(void*); - + void *config[] = { HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END }; - + hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); - + hipMemcpyDtoH(B, Bd, SIZE); for(uint32_t i=0;i tex; - void myFunc () + void myFunc () { // ... 
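The ``hipModuleGetFunction`` sample above follows the standard HIP module API. As a minimal, self-contained sketch of the same load-and-launch pattern, assuming the ``vcpy_isa.co`` code object and ``hello_world`` kernel named in that sample (context initialization and error checking are omitted for brevity, so this is an illustration rather than the verbatim sample source): ::

    #include <hip/hip_runtime.h>
    #include <hip/hip_runtime_api.h>
    #include <cstring>
    #include <vector>

    #define LEN 64
    #define SIZE (LEN << 2)

    int main() {
        float *A = new float[LEN];
        float *B = new float[LEN];
        for (uint32_t i = 0; i < LEN; i++) {
            A[i] = i * 1.0f;
            B[i] = 0.0f;
        }

        // Allocate device memory and copy the inputs down.
        hipDeviceptr_t Ad, Bd;
        hipMalloc((void**)&Ad, SIZE);
        hipMalloc((void**)&Bd, SIZE);
        hipMemcpyHtoD(Ad, A, SIZE);
        hipMemcpyHtoD(Bd, B, SIZE);

        // Load the pre-built code object and look up the kernel by name.
        hipModule_t module;
        hipFunction_t function;
        hipModuleLoad(&module, "vcpy_isa.co");
        hipModuleGetFunction(&function, module, "hello_world");

        // Pack the kernel arguments into one flat buffer and pass it through
        // the HIP_LAUNCH_PARAM_* "extra" mechanism rather than kernelParams.
        std::vector<void*> argBuffer(2);
        memcpy(&argBuffer[0], &Ad, sizeof(void*));
        memcpy(&argBuffer[1], &Bd, sizeof(void*));
        size_t argSize = argBuffer.size() * sizeof(void*);

        void *config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0],
                          HIP_LAUNCH_PARAM_BUFFER_SIZE, &argSize,
                          HIP_LAUNCH_PARAM_END};

        // One block of LEN work-items; grid and block sizes are passed explicitly.
        hipModuleLaunchKernel(function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL,
                              (void**)&config);

        // Copy the result back; B should now mirror A.
        hipMemcpyDtoH(B, Bd, SIZE);

        hipFree(Ad);
        hipFree(Bd);
        delete[] A;
        delete[] B;
        return 0;
    }

Packing the arguments into a single buffer keeps the launch call identical on both back ends: on ROCm the buffer is consumed by the HIP runtime directly, while on the NVCC path the ``HIP_LAUNCH_PARAM_*`` markers are forwarded through the CUDA driver API's ``extra`` launch argument.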
diff --git a/README.md b/README.md index 979789b9..e1c99092 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ - ## ROCm Documentation - - Repository for ROCm documentation rendered by ReadtheDocs + ## ROCm Documentation + + Repository for ROCm documentation rendered by ReadtheDocs diff --git a/ROCm.rst b/ROCm.rst index 6e7d23ab..1dbce90e 100644 --- a/ROCm.rst +++ b/ROCm.rst @@ -33,7 +33,7 @@ This guide provides documentation on the ROCm programming model and programming * listings of supported mathematical functions * C++ features supported in host and device code * technical specifications of various devices -* introduction to the low-level driver API +* introduction to the low-level driver API | - `ROCm Languages `_ @@ -69,11 +69,11 @@ Performance and optimization for various device types such as GCN devices `GCN ISA Manuals `_ -* `GCN 1.1 `_ - For information on ISA Manual for Hawaii (Sea Islands Series Instruction Set Architecture) +* `GCN 1.1 `_ - For information on ISA Manual for Hawaii (Sea Islands Series Instruction Set Architecture) * `GCN 2.0 `_ - For information on ISA Manual for Fiji and Polaris (AMD Accelerated Parallel Processing technology) -* `Vega `_ - Provides “Vega” Instruction Set Architecture, Program Organization, Mode register and more details. +* `Vega `_ - Provides "Vega" Instruction Set Architecture, Program Organization, Mode register and more details. * `Inline GCN ISA Assembly Guide `_ - Covers various concepts of AMDGCN Assembly, DS Permute Instructions, Parameters to a Kernel, GPR Counting. @@ -81,7 +81,7 @@ Performance and optimization for various device types such as GCN devices `ROCm API References `_ -* `ROCr System Runtime API `_ +* `ROCr System Runtime API `_ * `HCC Language Runtime API `_ @@ -93,7 +93,7 @@ Performance and optimization for various device types such as GCN devices * `Math Library API `_ - Includes HIP MAth API with hcRNG, clBLAS, clSPARSE APIs -* `Deep Learning API `_ - Includes MIOpen API and MIOpenGEMM APIs +* `Deep Learning API `_ - Includes MIOpen API and MIOpenGEMM APIs @@ -104,7 +104,7 @@ Performance and optimization for various device types such as GCN devices * `GCN Assembler and Disassembler `_ -* `GCN Assembler Tools `_ - AMDGPU ISA Assembler +* `GCN Assembler Tools `_ - AMDGPU ISA Assembler * `ROCm-GDB `_ - ROCm-GDB tool includes installtion, configuration, and working of Debugger and APIs @@ -112,20 +112,20 @@ Performance and optimization for various device types such as GCN devices * `ROCm-Tracer `_ - ROCm Tracer - provides a generic independent from specific runtime profiler to trace API and asynchronous activity. Includes details on library source tree, steps to build and run the test -* `CodeXL `_ +* `CodeXL `_ * `GPUperfAPI `_ - GPU Performance API, cloning, system requiments, and source code directory layout -`AOMP `_ +`AOMP `_ Provides details on AOMP, a scripted build of LLVM and supporting software. Supports OpenMP target offload on AMD GPUs. Since AOMP is a clang/llvm compiler, it also supports GPU offloading with HIP, CUDA, and OpenCL. -`ROCmValidationSuite `_ +`ROCmValidationSuite `_ -Provides details on ROCm Validation Suite (RVS), a system administrator’s and cluster manager’s tool for detecting and troubleshooting common problems affecting AMD GPU(s) running in a high-performance computing environment, enabled using the ROCm software stack on a compatible platform. 
+Provides details on ROCm Validation Suite (RVS), a system administrator's and cluster manager's tool for detecting and troubleshooting common problems affecting AMD GPU(s) running in a high-performance computing environment, enabled using the ROCm software stack on a compatible platform. | @@ -135,7 +135,7 @@ Provides details on ROCm Validation Suite (RVS), a system administrator’s and | This section provides details on rocFFT,it is a AMD's software library compiled with the CUDA compiler using HIP tools for running on Nvidia GPU devices. | `rocBLAS `_ -| This section provides details on rocBLAS, it is a library for BLAS on ROCm.rocBLAS is implemented in the HIP programming language and optimized for AMD’s latest discrete GPUs. +| This section provides details on rocBLAS, it is a library for BLAS on ROCm.rocBLAS is implemented in the HIP programming language and optimized for AMD's latest discrete GPUs. | `hipBLAS `_ | This section provides details on hipBLAS, it is a BLAS marshalling library, with multiple supported backends. hipBLAS exports an interface that does not require the client to change. Currently,it supports :ref:`rocblas` and cuBLAS as backends. @@ -153,7 +153,7 @@ Provides details on ROCm Validation Suite (RVS), a system administrator’s and | This section provides details on clBLAS. It makes easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. | `clSPARSE `_ -| This section provides details on clSPARSE, it is an OpenCL library which implements Sparse linear algebra routines. +| This section provides details on clSPARSE, it is an OpenCL library which implements Sparse linear algebra routines. | `clRNG `_ | This section provides details on clRNG,This is a library for uniform random number generation in OpenCL. @@ -171,10 +171,10 @@ Provides details on ROCm Validation Suite (RVS), a system administrator’s and | This section provides details on rocSPARSE.It is a library that contains basic linear algebra subroutines for sparse matrices and vectors written in HiP for GPU devices. It is designed to be used from C and C++ code. | `rocThrust `_ -| This section provides details on rocThrust. It is a parallel algorithmn library. +| This section provides details on rocThrust. It is a parallel algorithmn library. -| `hipCUB `_ This section provides details on hipCUB. -| It is a thin wrapper library on top of rocPRIM or CUB. It enables developers to port the project using CUB library to the HIP layer and to +| `hipCUB `_ This section provides details on hipCUB. +| It is a thin wrapper library on top of rocPRIM or CUB. It enables developers to port the project using CUB library to the HIP layer and to | run them on AMD hardware. | `ROCm SMI Library `_ This section provides details on ROCm SMI library. The ROCm System Management Interface Library, or ROCm SMI library is part of the Radeon Open Compute ROCm software stack. It is a C library for linux that provides a user space interface for applications to monitor and control GPU aplications. @@ -182,7 +182,7 @@ Provides details on ROCm Validation Suite (RVS), a system administrator’s and | `RCCL `_ This section provides details on ROCm Communications Collectives Library. It is a stand alone library of standard collective communication routines for GPUS, implememting all-reduce, all gather, reduce, broadcast, and reduce scatter. | `AMD MivisionX `_ -This section provides information on AMD’s graph optimization engine. 
+This section provides information on AMD's graph optimization engine. `ROCm Compiler SDK `_ @@ -192,7 +192,7 @@ This section provides information on AMD’s graph optimization engine. | `ROCm Code Object Format `_ | This section describes about application binary interface (ABI) provided by the AMD, implementation of the HSA runtime. It also provides details on Kernel, AMD Queue and Signals. - + | `ROCm Device Library `_ | Documentation on instruction related to ROCm Device Library overview,Building and Testing related information with respect to Device Library is provided. @@ -225,7 +225,7 @@ This section provides information on AMD’s graph optimization engine. | ROCmRDMA is the solution designed to allow third-party kernel drivers to utilize DMA access to the GPU memory. Complete indoemation related to ROCmRDMA is Documented here. | `UCX `_ -| This section gives information related to UCX, How to install, Running UCX and much more +| This section gives information related to UCX, How to install, Running UCX and much more | `MPI `_ | This section gives information related to MPI. diff --git a/ROCm_API_References/BLAS1.rst b/ROCm_API_References/BLAS1.rst index 9e97ec07..e554efa9 100644 --- a/ROCm_API_References/BLAS1.rst +++ b/ROCm_API_References/BLAS1.rst @@ -7,7 +7,7 @@ BLAS1 functions SWAP - Swap elements from 2 vectors ------------------------------------ .. doxygenfunction:: clblasCswap() - + .. doxygenfunction:: clblasDswap() .. doxygenfunction:: clblasSswap() @@ -30,7 +30,7 @@ SSCAL - Scales a complex vector by a real constant .. doxygenfunction:: clblasCsscal() .. doxygenfunction:: clblasZdscal() - + COPY - Copies elements from vector X to vector Y -------------------------------------------------- @@ -51,7 +51,7 @@ AXPY - Scale X and add to Y .. doxygenfunction:: clblasSaxpy() .. doxygenfunction:: clblasZaxpy() - + DOT - Dot product of two vectors @@ -73,8 +73,8 @@ ROTG - Constructs givens plane rotation .. doxygenfunction:: clblasCrotg() .. doxygenfunction:: clblasDrotg() - -.. doxygenfunction:: clblasSrotg() + +.. doxygenfunction:: clblasSrotg() .. doxygenfunction:: clblasZrotg() @@ -105,14 +105,14 @@ ROTM - Apply modified givens rotation for points in the plane NRM2 - Euclidean norm of a vector ------------------------------------- +------------------------------------ .. doxygenfunction:: clblasDnrm2() .. doxygenfunction:: clblasDznrm2() .. doxygenfunction:: clblasScnrm2() -.. doxygenfunction:: clblasSnrm2() +.. doxygenfunction:: clblasSnrm2() iAMAX - Index of max absolute value ------------------------------------ @@ -126,7 +126,7 @@ iAMAX - Index of max absolute value ASUM - Sum of absolute values ------------------------------------- +------------------------------------ .. doxygenfunction:: clblasDasum() .. doxygenfunction:: clblasDzasum() diff --git a/ROCm_API_References/BLAS2.rst b/ROCm_API_References/BLAS2.rst index 41662805..b4c699ac 100644 --- a/ROCm_API_References/BLAS2.rst +++ b/ROCm_API_References/BLAS2.rst @@ -21,14 +21,14 @@ SYMV - Symmetric matrix-Vector multiplication HEMV - Hermitian matrix-vector multiplication ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasChemv() .. doxygenfunction:: clblasZhemv() TRMV - Triangular matrix vector multiply ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasCtrmv() .. 
doxygenfunction:: clblasDtrmv() @@ -39,7 +39,7 @@ TRMV - Triangular matrix vector multiply TRSV - Triangular matrix vector Solve ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasCtrsv() .. doxygenfunction:: clblasDtrsv() @@ -49,7 +49,7 @@ TRSV - Triangular matrix vector Solve .. doxygenfunction:: clblasZtrsv() GER - General matrix rank 1 operation ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasDger() .. doxygenfunction:: clblasSger() @@ -110,7 +110,7 @@ TPMV - Triangular packed matrix-vector multiply TPSV - Triangular packed matrix vector solve ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasCtpsv() .. doxygenfunction:: clblasStpsv() @@ -143,9 +143,9 @@ SPR - Symmetric packed matrix rank 1 update .. doxygenfunction:: clblasSspr() - + HPR - Hermitian packed matrix rank 1 update ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasChpr() .. doxygenfunction:: clblasZhpr() @@ -153,7 +153,7 @@ HPR - Hermitian packed matrix rank 1 update SPR2 - Symmetric packed matrix rank 2 update ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasDspr2() @@ -167,7 +167,7 @@ HPR2 - Hermitian packed matrix rank 2 update .. doxygenfunction:: clblasZhpr2() - + GBMV - General banded matrix-vector multiplication --------------------------------------------------- .. doxygenfunction:: clblasCgbmv() @@ -210,7 +210,7 @@ HBMV - Hermitian banded matrix-vector multiplication TBSV - Solving triangular banded matrix ---------------------------------------------- +--------------------------------------------- .. doxygenfunction:: clblasCtbsv() diff --git a/ROCm_API_References/BLAS3.rst b/ROCm_API_References/BLAS3.rst index a78df278..7f3d7708 100644 --- a/ROCm_API_References/BLAS3.rst +++ b/ROCm_API_References/BLAS3.rst @@ -44,7 +44,7 @@ TRSM - Solving triangular systems of equations SYRK - Symmetric rank-k update of a matrix --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasCsyrk() @@ -58,7 +58,7 @@ SYRK - Symmetric rank-k update of a matrix SYR2K - Symmetric rank-2k update to a matrix --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasSsyr2k() .. doxygenfunction:: clblasZsyr2k() @@ -66,7 +66,7 @@ SYR2K - Symmetric rank-2k update to a matrix SYMM - Symmetric matrix-matrix multiply --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasCsymm() .. doxygenfunction:: clblasDsymm() @@ -80,7 +80,7 @@ SYMM - Symmetric matrix-matrix multiply HEMM - Hermitian matrix-matrix multiplication --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasChemm() @@ -89,7 +89,7 @@ HEMM - Hermitian matrix-matrix multiplication HERK - Hermitian rank-k update to a matrix --------------------------------------------- +-------------------------------------------- .. doxygenfunction:: clblasCherk() @@ -100,7 +100,7 @@ HERK - Hermitian rank-k update to a matrix HER2K - Hermitian rank-2k update to a matrix --------------------------------------------- +-------------------------------------------- .. 
doxygenfunction:: clblasCher2k() diff --git a/ROCm_API_References/HCC-API.rst b/ROCm_API_References/HCC-API.rst index 13abbcab..e9024358 100644 --- a/ROCm_API_References/HCC-API.rst +++ b/ROCm_API_References/HCC-API.rst @@ -46,7 +46,7 @@ For example: :: - `` hcchcc-config –cxxflags –ldflagsfoo.cpp -o foo `` + `` hcchcc-config -cxxflags -ldflagsfoo.cpp -o foo `` HCC built-in macros ******************** @@ -143,4 +143,4 @@ HC supports capturing memory pointer by a GPU kernel. ``` // allocate GPU memory through the HSA API int* gpu_pointer; hsa_memory_allocate(..., &gpu_pointer); ... parallel_for_each(ext, [=](index i) [[hc]] { gpu_pointer[i[0]]++; } -``` For HSA APUs that supports system wide shared virtual memory, a GPU kernel can directly access system memory allocated by the host: ``` int* cpu_memory = (int*) malloc(...); ... parallel_for_each(ext, [=](index i) [[hc]] { cpu_memory[i[0]]++; }); ``` \ No newline at end of file +``` For HSA APUs that supports system wide shared virtual memory, a GPU kernel can directly access system memory allocated by the host: ``` int* cpu_memory = (int*) malloc(...); ... parallel_for_each(ext, [=](index i) [[hc]] { cpu_memory[i[0]]++; }); ``` diff --git a/ROCm_API_References/HIP-MATH.rst b/ROCm_API_References/HIP-MATH.rst index bc20d3c6..1fc84001 100644 --- a/ROCm_API_References/HIP-MATH.rst +++ b/ROCm_API_References/HIP-MATH.rst @@ -1,6 +1,6 @@ .. _HIP-MATH: -HIP MATH APIs Documentation +HIP MATH APIs Documentation ############################ HIP supports most of the device functions supported by CUDA. Way to find the unsupported one is to search for the function and check its description @@ -9,14 +9,14 @@ HIP supports most of the device functions supported by CUDA. Way to find the uns For Developers If you add or fixed a device function, make sure to add a signature of the function and definition later. 
-For example, if you want to add `__device__ float __dotf(float4, float4)`, which does a dot product on 4 float vector components -The way to add to the header is, +For example, if you want to add `__device__ float __dotf(float4, float4)`, which does a dot product on 4 float vector components +The way to add to the header is, -:: +:: -__device__ static float __dotf(float4, float4); +__device__ static float __dotf(float4, float4); /*Way down in the file....*/ -__device__ static inline float __dotf(float4 x, float4 y) { +__device__ static inline float __dotf(float4 x, float4 y) { /*implementation*/ } @@ -70,7 +70,7 @@ atan2f ********* :: - + __device__ float atan2f(float y, float x); **Description:** Supported @@ -90,7 +90,7 @@ __device__ float atanf(float x); atanhf ********* -:: +:: __device__ float atanhf(float x); @@ -110,7 +110,7 @@ __device__ float cbrtf(float x); ceilf ********* -:: +:: __device__ float ceilf(float x); @@ -121,7 +121,7 @@ __device__ float ceilf(float x); copysignf ********* -:: +:: __device__ float copysignf(float x, float y); @@ -132,7 +132,7 @@ copysignf cosf ********* -:: +:: __device__ float cosf(float x); @@ -142,7 +142,7 @@ __device__ float cosf(float x); coshf ********* -:: +:: __device__ float coshf(float x); @@ -152,7 +152,7 @@ __device__ float coshf(float x); cospif ********* -:: +:: __device__ float cospif(float x); @@ -162,7 +162,7 @@ __device__ float cospif(float x); cyl_bessel_i0f ********* -:: +:: //__device__ float cyl_bessel_i0f(float x); @@ -172,7 +172,7 @@ cyl_bessel_i0f cyl_bessel_i1f ********* -:: +:: //__device__ float cyl_bessel_i1f(float x); @@ -181,8 +181,8 @@ cyl_bessel_i1f erfcf ********* - :: - + :: + __device__ float erfcf(float x); @@ -191,7 +191,7 @@ erfcf erfcinvf ********* -:: +:: __device__float erfcinvf(float y); @@ -200,7 +200,7 @@ __device__float erfcinvf(float y); erfcxf ********* -:: +:: __device__ float erfcxf(float x); @@ -209,7 +209,7 @@ erfcxf erff ********* -:: +:: __device__ float erff(float x); @@ -219,7 +219,7 @@ __device__ float erff(float x); erfinvf ********* -:: +:: __device__ float erfinvf(float y); @@ -229,7 +229,7 @@ __device__ float erfinvf(float y); exp10f ********* -:: +:: __device__ float exp10f(float x); @@ -239,7 +239,7 @@ __device__ float exp10f(float x); exp2f ********* -:: +:: _device__ float exp2f(float x); @@ -250,7 +250,7 @@ _device__ float exp2f(float x); expf ********* -:: +:: __device__ float expf(float x); @@ -261,7 +261,7 @@ __device__ float expf(float x); expm1f ********* -:: +:: __device__ float expm1f(float x); @@ -272,7 +272,7 @@ __device__ float expm1f(float x); fabsf ********* :: - + __device__ float fabsf(float x); @@ -281,7 +281,7 @@ fabsf fdimf ********* -:: +:: __device__ float fdimf(float x, float y); @@ -291,7 +291,7 @@ __device__ float fdimf(float x, float y); fdivide ********* -:: +:: __device__ float fdividef(float x, float y); @@ -301,7 +301,7 @@ __device__ float fdividef(float x, float y); floorf ********* -:: +:: __device__ float floorf(float x); @@ -311,7 +311,7 @@ __device__ float floorf(float x); fmaf ********* -:: +:: __device__ float fmaf(float x, float y, float z); @@ -321,7 +321,7 @@ __device__ float fmaf(float x, float y, float z); fmaxf ********* -:: +:: __device__ float fmaxf(float x, float y); @@ -331,7 +331,7 @@ __device__ float fmaxf(float x, float y); fminf ********* -:: +:: __device__ float fminf(float x, float y); @@ -341,7 +341,7 @@ __device__ float fminf(float x, float y); fmodf ********* -:: +:: __device__ float fmodf(float x, float y); @@ -351,8 +351,8 @@ 
__device__ float fmodf(float x, float y); frexpf ********* -:: - +:: + //__device__ float frexpf(float x, int* nptr); @@ -361,7 +361,7 @@ frexpf hypotf ********* -:: +:: __device__ float hypotf(float x, float y); @@ -371,7 +371,7 @@ __device__ float hypotf(float x, float y); ilogbf ********* -:: +:: __device__ float ilogbf(float x); @@ -381,7 +381,7 @@ __device__ float ilogbf(float x); isfinite ********* -:: +:: __device__ int isfinite(float a); @@ -391,7 +391,7 @@ __device__ int isfinite(float a); isinf ********* -:: +:: __device__ unsigned isinf(float a); @@ -401,7 +401,7 @@ isinf isnan ********* -:: +:: __device__ unsigned isnan(float a); @@ -411,7 +411,7 @@ isnan j0f ********* -:: +:: __device__ float j0f(float x); @@ -421,7 +421,7 @@ __device__ float j0f(float x); j1f ********* -:: +:: __device__ float j1f(float x); @@ -431,7 +431,7 @@ j1f jnf ********* -:: +:: __device__ float jnf(int n, float x); @@ -440,7 +440,7 @@ __device__ float jnf(int n, float x); ldexpf ********* -:: +:: __device__ float ldexpf(float x, int exp); @@ -450,7 +450,7 @@ __device__ float ldexpf(float x, int exp); lgammaf ********* -:: +:: //__device__ float lgammaf(float x); @@ -460,7 +460,7 @@ lgammaf llrintf ********* -:: +:: __device__ long long int llrintf(float x); @@ -470,7 +470,7 @@ __device__ long long int llrintf(float x); llroundf ********* -:: +:: __device__ long long int llroundf(float x); @@ -480,7 +480,7 @@ __device__ long long int llroundf(float x); log10f ********* -:: +:: __device__ float log10f(float x); @@ -490,7 +490,7 @@ __device__ float log10f(float x); log1pf ********* -:: +:: __device__ float log1pf(float x); @@ -500,7 +500,7 @@ __device__ float log1pf(float x); logbf ********* -:: +:: __device__ float logbf(float x); @@ -510,7 +510,7 @@ __device__ float logbf(float x); lrintf ********* -:: +:: __device__ long int lrintf(float x); @@ -520,7 +520,7 @@ __device__ long int lrintf(float x); lroundf ********* -:: +:: __device__ long int lroundf(float x); @@ -530,7 +530,7 @@ __device__ long int lroundf(float x); modff ********* -:: +:: //__device__ float modff(float x, float *iptr); @@ -540,7 +540,7 @@ modff nanf ********* -:: +:: __device__ float nanf(const char* tagp); @@ -550,7 +550,7 @@ nanf nearbyintf ********* -:: +:: __device__ float nearbyintf(float x); @@ -560,7 +560,7 @@ __device__ float nearbyintf(float x); nextafterf ********* -:: +:: //__device__ float nextafterf(float x, float y); @@ -570,7 +570,7 @@ nextafterf norm3df ********* -:: +:: __device__ float norm3df(float a, float b, float c); @@ -580,7 +580,7 @@ norm3df norm4df ********* -:: +:: __device__ float norm4df(float a, float b, float c, float d); @@ -590,7 +590,7 @@ __device__ float norm4df(float a, float b, float c, float d); normcdff ********* -:: +:: __device__ float normcdff(float y); @@ -600,7 +600,7 @@ __device__ float normcdff(float y); normcdfinvf ********* -:: +:: __device__ float normcdfinvf(float y); @@ -610,7 +610,7 @@ normcdfinvf normf ********* -:: +:: __device__ float normf(int dim, const float *a); @@ -620,7 +620,7 @@ __device__ float normf(int dim, const float *a); powf ********* -:: +:: __device__ float powf(float x, float y); @@ -630,8 +630,8 @@ powf rcbrtf ********* -:: - +:: + __device__ float rcbrtf(float x); @@ -640,7 +640,7 @@ rcbrtf remainderf ********* -:: +:: __device__ float remainderf(float x, float y); @@ -649,8 +649,8 @@ remainderf remquof ********* -:: - +:: + __device__ float remquof(float x, float y, int *quo); @@ -659,7 +659,7 @@ remquof rhypotf ********* -:: +:: __device__ float 
rhypotf(float x, float y); @@ -669,7 +669,7 @@ __device__ float rhypotf(float x, float y); rintf ********* -:: +:: __device__ float rintf(float x); @@ -678,7 +678,7 @@ rintf rnorm3df ********* -:: +:: __device__ float rnorm3df(float a, float b, float c); @@ -688,7 +688,7 @@ rnorm3df rnorm4df ********* -:: +:: __device__ float rnorm4df(float a, float b, float c, float d); @@ -698,7 +698,7 @@ rnorm4df rnormf ********* -:: +:: __device__ float rnormf(int dim, const float* a); @@ -708,7 +708,7 @@ __device__ float rnormf(int dim, const float* a); roundf ********* -:: +:: __device__ float roundf(float x); @@ -718,7 +718,7 @@ roundf rsqrtf ********* -:: +:: __device__ float rsqrtf(float x); @@ -728,7 +728,7 @@ rsqrtf scalblnf ********* -:: +:: __device__ float scalblnf(float x, long int n); @@ -738,7 +738,7 @@ scalblnf scalbnf ********* -:: +:: __device__ float scalbnf(float x, int n); @@ -748,7 +748,7 @@ scalbnf signbit ********* -:: +:: __device__ int signbit(float a); @@ -757,7 +757,7 @@ signbit sincosf ********* -:: +:: __device__ void sincosf(float x, float *sptr, float *cptr); @@ -767,7 +767,7 @@ sincosf sincospif ********* -:: +:: __device__ void sincospif(float x, float *sptr, float *cptr); @@ -777,7 +777,7 @@ __device__ void sincospif(float x, float *sptr, float *cptr); sinf ********* -:: +:: __device__ float sinf(float x); @@ -787,7 +787,7 @@ __device__ float sinf(float x); sinhf ********* -:: +:: __device__ float sinhf(float x); @@ -797,7 +797,7 @@ __device__ float sinhf(float x); sinpif ********* -:: +:: __device__ float sinpif(float x); @@ -807,8 +807,8 @@ __device__ float sinpif(float x); sqrtf ********* -:: - +:: + __device__ float sqrtf(float x); **Description:** Supported @@ -817,7 +817,7 @@ __device__ float sqrtf(float x); tanf ********* -:: +:: __device__ float tanf(float x); @@ -826,8 +826,8 @@ tanf tanhf -********* - :: +********* + :: __device__ float tanhf(float x); @@ -837,7 +837,7 @@ tanhf tgammaf ********* -:: +:: __device__ float tgammaf(float x); @@ -847,8 +847,8 @@ tgammaf truncf ********* -:: - +:: + __device__ float truncf(float x); @@ -857,7 +857,7 @@ truncf y0f ********* -:: +:: __device__ float y0f(float x); @@ -867,7 +867,7 @@ __device__ float y0f(float x); y1f ********* -:: +:: __device__ float y1f(float x); @@ -876,7 +876,7 @@ __device__ float y1f(float x); ynf ********* -:: +:: __device__ float ynf(int n, float x); @@ -886,7 +886,7 @@ ynf acos ********* -:: +:: __device__ double acos(double x); @@ -896,7 +896,7 @@ acos acosh ********* -:: +:: __device__ double acosh(double x); @@ -906,7 +906,7 @@ __device__ double acosh(double x); asin ********* -:: +:: __device__ double asin(double x); @@ -916,7 +916,7 @@ asin asinh ********* -:: +:: __device__ double asinh(double x); @@ -926,8 +926,8 @@ asinh atan ********* -:: - +:: + __device__ double atan(double x); @@ -936,8 +936,8 @@ atan atan2 ********* -:: - +:: + __device__ double atan2(double y, double x); @@ -946,7 +946,7 @@ atan2 atanh ********* -:: +:: __device__ double atanh(double x); @@ -956,8 +956,8 @@ atanh cbrt ********* -:: - +:: + __device__ double cbrt(double x); @@ -967,7 +967,7 @@ cbrt ceil ********* :: - + __device__ double ceil(double x); @@ -976,7 +976,7 @@ ceil copysign ********* -:: +:: __device__ double copysign(double x, double y); @@ -985,7 +985,7 @@ copysign cos ********* -:: +:: __device__ double cos(double x); @@ -995,7 +995,7 @@ cos cosh ********* -:: +:: __device__ double cosh(double x); @@ -1005,7 +1005,7 @@ cosh cospi ********* -:: +:: __device__ double cospi(double x); @@ -1015,7 
+1015,7 @@ cospi cyl_bessel_i0 ****************** -:: +:: //__device__ double cyl_bessel_i0(double x); @@ -1025,7 +1025,7 @@ cyl_bessel_i0 cyl_bessel_i1 ****************** -:: +:: //__device__ double cyl_bessel_i1(double x); @@ -1035,8 +1035,8 @@ cyl_bessel_i1 erf ********* -:: - +:: + __device__ double erf(double x); @@ -1046,7 +1046,7 @@ erf erfc ********* :: - + __device__ double erfc(double x); @@ -1055,7 +1055,7 @@ erfc erfcinv ********* -:: +:: __device__ double erfcinv(double y); @@ -1065,7 +1065,7 @@ erfcinv erfcx ********* -:: +:: __device__ double erfcx(double x); @@ -1075,7 +1075,7 @@ erfcx erfinv ********* -:: +:: __device__ double erfinv(double x); @@ -1085,7 +1085,7 @@ erfinv exp ********* -:: +:: __device__ double exp(double x); @@ -1095,7 +1095,7 @@ exp exp10 ********* -:: +:: __device__ double exp10(double x); @@ -1105,7 +1105,7 @@ exp10 exp2 ********* -:: +:: __device__ double exp2(double x); @@ -1115,7 +1115,7 @@ exp2 expm1 ********* -:: +:: __device__ double expm1(double x); @@ -1125,7 +1125,7 @@ expm1 fabs ********* -:: +:: __device__ double fabs(double x); @@ -1135,7 +1135,7 @@ fabs fdim ********* -:: +:: __device__ double fdim(double x, double y); @@ -1145,7 +1145,7 @@ fdim floor ********* -:: +:: __device__ double floor(double x); @@ -1155,7 +1155,7 @@ floor fma ********* -:: +:: __device__ double fma(double x, double y, double z); @@ -1165,7 +1165,7 @@ fma fmax ********* -:: +:: __device__ double fmax(double x, double y); @@ -1175,7 +1175,7 @@ fmax fmin ********* -:: +:: __device__ double fmin(double x, double y); @@ -1186,16 +1186,16 @@ fmin fmod ********* :: - + __device__ double fmod(double x, double y); - + **Description:** Supported frexp ********* -:: +:: //__device__ double frexp(double x, int *nptr); @@ -1205,7 +1205,7 @@ frexp hypot ********* -:: +:: __device__ double hypot(double x, double y); @@ -1215,7 +1215,7 @@ hypot ilogb ********* -:: +:: __device__ double ilogb(double x); @@ -1226,7 +1226,7 @@ ilogb isfinite ********* :: - + __device__ int isfinite(double x); @@ -1235,7 +1235,7 @@ isfinite isinf ********* -:: +:: __device__ unsigned isinf(double x); @@ -1245,7 +1245,7 @@ isinf isnan ********* -:: +:: __device__ unsigned isnan(double x); @@ -1256,7 +1256,7 @@ isnan j0 ********* :: - + __device__ double j0(double x); @@ -1265,7 +1265,7 @@ j0 j1 ********* -:: +:: __device__ double j1(double x); @@ -1275,7 +1275,7 @@ j1 jn ********* -:: +:: __device__ double jn(int n, double x); @@ -1285,7 +1285,7 @@ jn ldexp ********* -:: +:: __device__ double ldexp(double x, int exp); @@ -1295,7 +1295,7 @@ ldexp lgamma ********* -:: +:: __device__ double lgamma(double x); @@ -1305,7 +1305,7 @@ lgamma llrint ********* -:: +:: __device__ long long llrint(double x); @@ -1315,7 +1315,7 @@ llrint llround ********* -:: +:: __device__ long long llround(double x); @@ -1325,7 +1325,7 @@ llround log ********* -:: +:: __device__ double log(double x); @@ -1336,16 +1336,16 @@ log log10 ********* :: - + __device__ double log10(double x); - + **Description:** Supported log1p ********* -:: +:: __device__ double log1p(double x); @@ -1355,7 +1355,7 @@ log1p log2 ********* -:: +:: __device__ double log2(double x); @@ -1365,7 +1365,7 @@ log2 logb ********* -:: +:: __device__ double logb(double x); @@ -1375,7 +1375,7 @@ logb lrint ********* -:: +:: __device__ long int lrint(double x); @@ -1385,7 +1385,7 @@ lrint lround ********* -:: +:: __device__ long int lround(double x); @@ -1395,7 +1395,7 @@ lround modf ********* -:: +:: //__device__ double modf(double x, double *iptr); @@ -1405,7 
+1405,7 @@ modf nan ********* -:: +:: __device__ double nan(const char* tagp); @@ -1415,7 +1415,7 @@ nan nearbyint ********* -:: +:: __device__ double nearbyint(double x); @@ -1425,7 +1425,7 @@ nearbyint nextafter ********* -:: +:: __device__ double nextafter(double x, double y); @@ -1435,7 +1435,7 @@ nextafter norm ********* -:: +:: __device__ double norm(int dim, const double* t); @@ -1445,7 +1445,7 @@ norm norm3d ********* -:: +:: __device__ double norm3d(double a, double b, double c); @@ -1455,7 +1455,7 @@ norm3d norm4d ********* -:: +:: __device__ double norm4d(double a, double b, double c, double d); @@ -1465,7 +1465,7 @@ norm4d normcdf ********* -:: +:: __device__ double normcdf(double y); @@ -1475,7 +1475,7 @@ normcdf normcdfinv ********* -:: +:: __device__ double normcdfinv(double y); @@ -1485,7 +1485,7 @@ normcdfinv pow ********* -:: +:: __device__ double pow(double x, double y); @@ -1495,7 +1495,7 @@ pow rcbrt ********* -:: +:: __device__ double rcbrt(double x); @@ -1505,7 +1505,7 @@ rcbrt remainder ********* -:: +:: __device__ double remainder(double x, double y); @@ -1515,7 +1515,7 @@ remainder remquo ********* -:: +:: //__device__ double remquo(double x, double y, int *quo); @@ -1525,7 +1525,7 @@ remquo rhypot ********* -:: +:: __device__ double rhypot(double x, double y); @@ -1535,7 +1535,7 @@ rhypot rint ********* -:: +:: __device__ double rint(double x); @@ -1545,7 +1545,7 @@ rint rnorm ********* -:: +:: __device__ double rnorm(int dim, const double* t); @@ -1555,7 +1555,7 @@ rnorm rnorm3d ********* -:: +:: __device__ double rnorm3d(double a, double b, double c); @@ -1566,7 +1566,7 @@ rnorm3d rnorm4d ********* :: - + __device__ double rnorm4d(double a, double b, double c, double d); @@ -1575,7 +1575,7 @@ rnorm4d round ********* -:: +:: __device__ double round(double x); @@ -1585,7 +1585,7 @@ round rsqrt ********* -:: +:: __device__ double rsqrt(double x); @@ -1595,7 +1595,7 @@ rsqrt scalbln ********* -:: +:: __device__ double scalbln(double x, long int n); @@ -1605,7 +1605,7 @@ scalbln scalbn ********* -:: +:: __device__ double scalbn(double x, int n); @@ -1615,7 +1615,7 @@ scalbn signbit ********* -:: +:: __device__ int signbit(double a); @@ -1625,7 +1625,7 @@ signbit sin ********* -:: +:: __device__ double sin(double a); @@ -1635,7 +1635,7 @@ sin sincos ********* -:: +:: __device__ void sincos(double x, double *sptr, double *cptr); @@ -1645,7 +1645,7 @@ sincos sincospi ********* -:: +:: __device__ void sincospi(double x, double *sptr, double *cptr); @@ -1655,7 +1655,7 @@ sincospi sinh ********* -:: +:: __device__ double sinh(double x); @@ -1665,7 +1665,7 @@ sinh sinpi ********* -:: +:: __device__ double sinpi(double x); @@ -1675,7 +1675,7 @@ sinpi sqrt ********* -:: +:: __device__ double sqrt(double x); @@ -1685,7 +1685,7 @@ sqrt tan ********* -:: +:: __device__ double tan(double x); @@ -1695,7 +1695,7 @@ tan tanh ********* -:: +:: __device__ double tanh(double x); @@ -1705,7 +1705,7 @@ tanh tgamma ********* -:: +:: __device__ double tgamma(double x); @@ -1715,7 +1715,7 @@ tgamma trunc ********* -:: +:: __device__ double trunc(double x); @@ -1725,7 +1725,7 @@ trunc y0 ********* -:: +:: __device__ double y0(double x); @@ -1735,7 +1735,7 @@ y0 y1 ********* -:: +:: __device__ double y1(double y); @@ -1745,7 +1745,7 @@ y1 yn ********* -:: +:: __device__ double yn(int n, double x); @@ -1755,7 +1755,7 @@ yn __cosf ********* -:: +:: __device__float __cosf(float x); @@ -1765,7 +1765,7 @@ __cosf __exp10f ********* -:: +:: __device__float __exp10f(float x); @@ -1775,7 +1775,7 @@ 
__exp10f __expf ********* -:: +:: __device__float __expf(float x); @@ -1785,7 +1785,7 @@ __expf __fadd_rd ********* -:: +:: __device__ staticfloat __fadd_rd(float x, float y); @@ -1795,7 +1795,7 @@ __fadd_rd __fadd_rn ********* -:: +:: __device__ staticfloat __fadd_rn(float x, float y); @@ -1805,7 +1805,7 @@ __fadd_rn __fadd_ru ********* -:: +:: __device__ staticfloat __fadd_ru(float x, float y); @@ -1815,7 +1815,7 @@ __fadd_ru __fadd_rz ********* -:: +:: __device__ staticfloat __fadd_rz(float x, float y); @@ -1825,7 +1825,7 @@ __fadd_rz __fdiv_rd ********* -:: +:: __device__ staticfloat __fdiv_rd(float x, float y); @@ -1835,7 +1835,7 @@ __fdiv_rd __fdiv_rn ********* -:: +:: __device__ staticfloat __fdiv_rn(float x, float y); @@ -1845,7 +1845,7 @@ __fdiv_rn __fdiv_ru ********* -:: +:: __device__ staticfloat __fdiv_ru(float x, float y); @@ -1855,7 +1855,7 @@ __fdiv_ru __fdiv_rz ********* -:: +:: __device__ staticfloat __fdiv_rz(float x, float y); @@ -1865,7 +1865,7 @@ __fdiv_rz __fdividef ********* -:: +:: __device__ staticfloat __fdividef(float x, float y); @@ -1875,7 +1875,7 @@ __fdividef __fmaf_rd ********* -:: +:: __device__float __fmaf_rd(float x, float y, float z); @@ -1885,7 +1885,7 @@ __fmaf_rd __fmaf_rn ********* -:: +:: __device__float __fmaf_rn(float x, float y, float z); @@ -1895,7 +1895,7 @@ __fmaf_rn __fmaf_ru ********* -:: +:: __device__float __fmaf_ru(float x, float y, float z); @@ -1905,7 +1905,7 @@ __fmaf_ru __fmaf_rz ********* -:: +:: __device__float __fmaf_rz(float x, float y, float z); @@ -1915,7 +1915,7 @@ __fmaf_rz __fmul_rd ********* -:: +:: __device__ staticfloat __fmul_rd(float x, float y); @@ -1925,7 +1925,7 @@ __fmul_rd __fmul_rn ********* -:: +:: __device__ staticfloat __fmul_rn(float x, float y); @@ -1935,7 +1935,7 @@ __fmul_rn __fmul_ru ********* -:: +:: __device__ staticfloat __fmul_ru(float x, float y); @@ -1945,7 +1945,7 @@ __fmul_ru __fmul_rz ********* -:: +:: __device__ staticfloat __fmul_rz(float x, float y); @@ -1955,7 +1955,7 @@ __fmul_rz __frcp_rd ********* -:: +:: __device__float __frcp_rd(float x); @@ -1965,7 +1965,7 @@ __frcp_rd __frcp_rn ********* -:: +:: __device__float __frcp_rn(float x); @@ -1975,7 +1975,7 @@ __frcp_rn __frcp_ru ********* -:: +:: __device__float __frcp_ru(float x); @@ -1985,7 +1985,7 @@ __frcp_ru __frcp_rz ********* -:: +:: __device__float __frcp_rz(float x); @@ -1995,7 +1995,7 @@ __frcp_rz __frsqrt_rn ****************** -:: +:: __device__float __frsqrt_rn(float x); @@ -2005,7 +2005,7 @@ __frsqrt_rn __fsqrt_rd ****************** -:: +:: __device__float __fsqrt_rd(float x); @@ -2014,7 +2014,7 @@ __fsqrt_rd __fsqrt_rn -:: +:: __device__float __fsqrt_rn(float x); @@ -2023,7 +2023,7 @@ __device__float __fsqrt_rn(float x); __fsqrt_ru ********* -:: +:: __device__float __fsqrt_ru(float x); @@ -2033,7 +2033,7 @@ __fsqrt_ru __fsqrt_rz ********* -:: +:: __device__float __fsqrt_rz(float x); @@ -2043,7 +2043,7 @@ __fsqrt_rz __fsub_rd ********* -:: +:: __device__ staticfloat __fsub_rd(float x, float y); @@ -2053,7 +2053,7 @@ __fsub_rd __fsub_rn ********* -:: +:: __device__ staticfloat __fsub_rn(float x, float y); @@ -2063,7 +2063,7 @@ __fsub_rn __fsub_ru ********* -:: +:: __device__ staticfloat __fsub_ru(float x, float y); @@ -2073,7 +2073,7 @@ __fsub_ru __log10f ********* -:: +:: __device__float __log10f(float x); @@ -2083,7 +2083,7 @@ __log10f __log2f ********* -:: +:: __device__float __log2f(float x); @@ -2093,7 +2093,7 @@ __log2f __logf ********* -:: +:: __device__float __logf(float x); @@ -2103,7 +2103,7 @@ __logf __powf ********* -:: 
+:: __device__float __powf(float base, float exponent); @@ -2113,8 +2113,8 @@ __powf __saturatef ********* -:: - +:: + __device__ staticfloat __saturatef(float x); @@ -2123,7 +2123,7 @@ __saturatef __sincosf ********* -:: +:: __device__void __sincosf(float x, float *s, float *c); @@ -2133,7 +2133,7 @@ __sincosf __sinf ********* -:: +:: __device__float __sinf(float x); @@ -2143,7 +2143,7 @@ __sinf __tanf ********* -:: +:: __device__float __tanf(float x); @@ -2153,7 +2153,7 @@ __tanf __dadd_rd ********* -:: +:: __device__ staticdouble __dadd_rd(double x, double y); @@ -2163,7 +2163,7 @@ __dadd_rd __dadd_rn ********* -:: +:: __device__ staticdouble __dadd_rn(double x, double y); @@ -2173,8 +2173,8 @@ __dadd_rn __dadd_ru ********* -:: - +:: + __device__ staticdouble __dadd_ru(double x, double y); @@ -2183,7 +2183,7 @@ __dadd_ru __dadd_rz ********* -:: +:: __device__ staticdouble __dadd_rz(double x, double y); @@ -2193,7 +2193,7 @@ __dadd_rz __ddiv_rd ********* -:: +:: __device__ staticdouble __ddiv_rd(double x, double y); @@ -2203,7 +2203,7 @@ __ddiv_rd __ddiv_rn ********* -:: +:: __device__ staticdouble __ddiv_rn(double x, double y); @@ -2213,7 +2213,7 @@ __ddiv_rn __ddiv_ru ********* -:: +:: __device__ staticdouble __ddiv_ru(double x, double y); @@ -2223,7 +2223,7 @@ __ddiv_ru __ddiv_rz ********* -:: +:: __device__ staticdouble __ddiv_rz(double x, double y); @@ -2233,7 +2233,7 @@ __ddiv_rz __dmul_rd ********* -:: +:: __device__ staticdouble __dmul_rd(double x, double y); @@ -2244,7 +2244,7 @@ __dmul_rd __dmul_rn ********* :: - + __device__ staticdouble __dmul_rn(double x, double y); @@ -2254,7 +2254,7 @@ __dmul_rn __dmul_ru ********* :: - + __device__ staticdouble __dmul_ru(double x, double y); @@ -2264,7 +2264,7 @@ __dmul_ru __dmul_rz ********* :: - + __device__ staticdouble __dmul_rz(double x, double y); @@ -2273,7 +2273,7 @@ __dmul_rz __drcp_rd ********* -:: +:: __device__double __drcp_rd(double x); @@ -2283,7 +2283,7 @@ __drcp_rd __drcp_rn ********* -:: +:: __device__double __drcp_rn(double x); @@ -2293,8 +2293,8 @@ __drcp_rn __drcp_ru ********* -:: - +:: + __device__double __drcp_ru(double x); @@ -2303,7 +2303,7 @@ __drcp_ru __drcp_rz ********* -:: +:: __device__double __drcp_rz(double x); @@ -2313,7 +2313,7 @@ __drcp_rz __dsqrt_rd ********* -:: +:: __device__double __dsqrt_rd(double x); @@ -2323,7 +2323,7 @@ __dsqrt_rd __dsqrt_rn ********* -:: +:: __device__double __dsqrt_rn(double x); @@ -2333,7 +2333,7 @@ __dsqrt_rn __dsqrt_ru ********* -:: +:: __device__double __dsqrt_ru(double x); @@ -2343,7 +2343,7 @@ __dsqrt_ru __dsqrt_rz ********* -:: +:: __device__double __dsqrt_rz(double x); @@ -2353,7 +2353,7 @@ __dsqrt_rz __dsub_rd ********* -:: +:: __device__ staticdouble __dsub_rd(double x, double y); @@ -2364,7 +2364,7 @@ __dsub_rd __dsub_rn ********* -:: +:: __device__ staticdouble __dsub_rn(double x, double y); @@ -2374,7 +2374,7 @@ __dsub_rn __dsub_ru ********* -:: +:: __device__ staticdouble __dsub_ru(double x, double y); @@ -2384,7 +2384,7 @@ __dsub_ru __dsub_rz ********* -:: +:: __device__ staticdouble __dsub_rz(double x, double y); @@ -2394,7 +2394,7 @@ __dsub_rz __fma_rd ********* -:: +:: __device__double __fma_rd(double x, double y, double z); @@ -2404,7 +2404,7 @@ __fma_rd __fma_rn ********* -:: +:: __device__double __fma_rn(double x, double y, double z); @@ -2414,7 +2414,7 @@ __fma_rn __fma_ru ********* -:: +:: __device__double __fma_ru(double x, double y, double z); @@ -2424,7 +2424,7 @@ __fma_ru __fma_rz ********* -:: +:: __device__double __fma_rz(double x, double y, double 
z); @@ -2434,7 +2434,7 @@ __fma_rz __brev ********* -:: +:: __device__ unsigned int __brev( unsigned int x); @@ -2444,7 +2444,7 @@ __brev __brevll ********* -:: +:: __device__ unsigned long long int __brevll( unsigned long long int x); @@ -2454,7 +2454,7 @@ __brevll __byte_perm ********* -:: +:: __device__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s); @@ -2464,7 +2464,7 @@ __byte_perm __clz ********* -:: +:: __device__ unsigned int __clz(int x); @@ -2474,8 +2474,8 @@ __clz __clzll ********* -:: - +:: + __device__ unsigned int __clzll(long long int x); @@ -2484,7 +2484,7 @@ __clzll __ffs ********* -:: +:: __device__ unsigned int __ffs(int x); @@ -2494,7 +2494,7 @@ __ffs __ffsll ********* -:: +:: __device__ unsigned int __ffsll(long long int x); @@ -2504,7 +2504,7 @@ __ffsll __hadd ********* -:: +:: __device__ static unsigned int __hadd(int x, int y); @@ -2514,7 +2514,7 @@ __hadd __mul24 ********* -:: +:: __device__ static int __mul24(int x, int y); @@ -2524,7 +2524,7 @@ __mul24 __mul64hi ********* -:: +:: __device__ long long int __mul64hi(long long int x, long long int y); @@ -2534,7 +2534,7 @@ __mul64hi __mulhi ********* -:: +:: __device__ static int __mulhi(int x, int y); @@ -2544,7 +2544,7 @@ __mulhi __popc ********* -:: +:: __device__ unsigned int __popc(unsigned int x); @@ -2554,7 +2554,7 @@ __popc __popcll ********* -:: +:: __device__ unsigned int __popcll(unsigned long long int x); @@ -2564,7 +2564,7 @@ __popcll __rhadd ********* -:: +:: __device__ static int __rhadd(int x, int y); @@ -2574,7 +2574,7 @@ __rhadd __sad ********* -:: +:: __device__ static unsigned int __sad(int x, int y, int z); @@ -2584,7 +2584,7 @@ __sad __uhadd ********* -:: +:: __device__ static unsigned int __uhadd(unsigned int x, unsigned int y); @@ -2594,7 +2594,7 @@ __uhadd __umul24 ********* -:: +:: __device__ static int __umul24(unsigned int x, unsigned int y); @@ -2605,7 +2605,7 @@ __umul24 __umul64hi ********* -:: +:: __device__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y); @@ -2615,7 +2615,7 @@ __umul64hi __umulhi ********* -:: +:: __device__ static unsigned int __umulhi(unsigned int x, unsigned int y); @@ -2625,7 +2625,7 @@ __umulhi __urhadd ********* -:: +:: __device__ static unsigned int __urhadd(unsigned int x, unsigned int y); @@ -2635,7 +2635,7 @@ __urhadd __usad ********* -:: +:: __device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z); @@ -2645,7 +2645,7 @@ __usad __double2float_rd ****************** -:: +:: __device__ float __double2float_rd(double x); @@ -2655,7 +2655,7 @@ __double2float_rd __double2float_rn ****************** -:: +:: __device__ float __double2float_rn(double x); @@ -2665,7 +2665,7 @@ __double2float_rn __double2float_ru ****************** -:: +:: __device__ float __double2float_ru(double x); @@ -2675,7 +2675,7 @@ __double2float_ru __double2float_rz ****************** -:: +:: __device__ float __double2float_rz(double x); @@ -2685,7 +2685,7 @@ __double2float_rz __double2hiint ****************** -:: +:: __device__ int __double2hiint(double x); @@ -2695,7 +2695,7 @@ __double2hiint __double2int_rd ****************** -:: +:: __device__ int __double2int_rd(double x); @@ -2705,7 +2705,7 @@ __double2int_rd __double2int_rn ****************** -:: +:: __device__ int __double2int_rn(double x); @@ -2715,7 +2715,7 @@ __double2int_rn __double2int_ru ****************** -:: +:: __device__ int __double2int_ru(double x); @@ -2725,7 +2725,7 @@ __double2int_ru __double2int_rz ****************** -:: +:: 
__device__ int __double2int_rz(double x); @@ -2735,7 +2735,7 @@ __double2int_rz __double2ll_rd ****************** -:: +:: __device__ long long int __double2ll_rd(double x); @@ -2745,7 +2745,7 @@ __double2ll_rd __double2ll_rn ****************** -:: +:: __device__ long long int __double2ll_rn(double x); @@ -2756,7 +2756,7 @@ __double2ll_rn __double2ll_ru ****************** -:: +:: __device__ long long int __double2ll_ru(double x); @@ -2766,8 +2766,8 @@ __double2ll_ru __double2ll_rz ****************** -:: - +:: + __device__ long long int __double2ll_rz(double x); @@ -2776,7 +2776,7 @@ __double2ll_rz __double2loint ****************** -:: +:: __device__ int __double2loint(double x); @@ -2786,8 +2786,8 @@ __double2loint __double2uint_rd ****************** -:: - +:: + __device__ unsigned int __double2uint_rd(double x); @@ -2796,7 +2796,7 @@ __double2uint_rd __double2uint_rn ****************** -:: +:: __device__ unsigned int __double2uint_rn(double x); @@ -2806,8 +2806,8 @@ __double2uint_rn __double2uint_ru ****************** -:: - +:: + __device__ unsigned int __double2uint_ru(double x); @@ -2816,7 +2816,7 @@ __double2uint_ru __double2uint_rz ****************** -:: +:: __device__ unsigned int __double2uint_rz(double x); @@ -2826,7 +2826,7 @@ __double2uint_rz __double2ull_rd ****************** -:: +:: __device__ unsigned long long int __double2ull_rd(double x); @@ -2836,7 +2836,7 @@ __double2ull_rd __double2ull_rn ****************** -:: +:: __device__ unsigned long long int __double2ull_rn(double x); @@ -2846,7 +2846,7 @@ __double2ull_rn __double2ull_ru ****************** -:: +:: __device__ unsigned long long int __double2ull_ru(double x); @@ -2856,7 +2856,7 @@ __double2ull_ru __double2ull_rz ****************** -:: +:: __device__ unsigned long long int __double2ull_rz(double x); @@ -2866,7 +2866,7 @@ __double2ull_rz __double_as_longlong *************************** -:: +:: __device__ long long int __double_as_longlong(double x); @@ -2876,7 +2876,7 @@ __double_as_longlong __float2half_rn ****************** -:: +:: __device__ unsigned short __float2half_rn(float x); @@ -2886,7 +2886,7 @@ __float2half_rn __half2float ****************** -:: +:: __device__ float __half2float(unsigned short); @@ -2896,7 +2896,7 @@ __half2float __float2half_rn ****************** -:: +:: __device__ __half __float2half_rn(float x); @@ -2906,7 +2906,7 @@ __float2half_rn __half2float ****************** -:: +:: __device__ float __half2float(__half); @@ -2916,7 +2916,7 @@ __half2float __float2int_rd ****************** -:: +:: __device__ int __float2int_rd(float x); @@ -2926,7 +2926,7 @@ __float2int_rd __float2int_rn ****************** -:: +:: __device__ int __float2int_rn(float x); @@ -2936,7 +2936,7 @@ __float2int_rn __float2int_ru ****************** -:: +:: __device__ int __float2int_ru(float x); @@ -2946,7 +2946,7 @@ __float2int_ru __float2int_rz ****************** -:: +:: __device__ int __float2int_rz(float x); @@ -2956,7 +2956,7 @@ __float2int_rz __float2ll_rd ****************** -:: +:: __device__ long long int __float2ll_rd(float x); @@ -2966,7 +2966,7 @@ __float2ll_rd __float2ll_rn ****************** -:: +:: __device__ long long int __float2ll_rn(float x); @@ -2976,7 +2976,7 @@ __float2ll_rn __float2ll_ru ****************** -:: +:: __device__ long long int __float2ll_ru(float x); @@ -2986,8 +2986,8 @@ __float2ll_ru __float2ll_rz ****************** -:: - +:: + __device__ long long int __float2ll_rz(float x); @@ -2996,7 +2996,7 @@ __float2ll_rz __float2uint_rd ****************** -:: +:: __device__ unsigned int 
__float2uint_rd(float x); @@ -3006,7 +3006,7 @@ __float2uint_rd __float2uint_rn ****************** -:: +:: __device__ unsigned int __float2uint_rn(float x); @@ -3016,7 +3016,7 @@ __float2uint_rn __float2uint_ru ****************** -:: +:: __device__ unsigned int __float2uint_ru(float x); @@ -3026,7 +3026,7 @@ __float2uint_ru __float2uint_rz ****************** -:: +:: __device__ unsigned int __float2uint_rz(float x); @@ -3036,7 +3036,7 @@ __float2uint_rz __float2ull_rd ****************** -:: +:: __device__ unsigned long long int __float2ull_rd(float x); @@ -3046,7 +3046,7 @@ __float2ull_rd __float2ull_rn ****************** -:: +:: __device__ unsigned long long int __float2ull_rn(float x); @@ -3056,7 +3056,7 @@ __float2ull_rn __float2ull_ru ****************** -:: +:: __device__ unsigned long long int __float2ull_ru(float x); @@ -3066,7 +3066,7 @@ __float2ull_ru __float2ull_rz ****************** -:: +:: __device__ unsigned long long int __float2ull_rz(float x); @@ -3076,7 +3076,7 @@ __float2ull_rz __float_as_int ****************** -:: +:: __device__ int __float_as_int(float x); @@ -3086,7 +3086,7 @@ __float_as_int __float_as_uint ****************** -:: +:: __device__ unsigned int __float_as_uint(float x); @@ -3096,7 +3096,7 @@ __float_as_uint __hiloint2double ****************** -:: +:: __device__ double __hiloint2double(int hi, int lo); @@ -3106,7 +3106,7 @@ __hiloint2double __int2double_rn ****************** -:: +:: __device__ double __int2double_rn(int x); @@ -3116,7 +3116,7 @@ __int2double_rn __int2float_rd ****************** -:: +:: __device__ float __int2float_rd(int x); @@ -3126,7 +3126,7 @@ __int2float_rd __int2float_rn ****************** -:: +:: __device__ float __int2float_rn(int x); @@ -3136,7 +3136,7 @@ __int2float_rn __int2float_ru ****************** -:: +:: __device__ float __int2float_ru(int x); @@ -3146,7 +3146,7 @@ __int2float_ru __int2float_rz ****************** -:: +:: __device__ float __int2float_rz(int x); @@ -3157,7 +3157,7 @@ __int2float_rz __int_as_float ****************** -:: +:: __device__ float __int_as_float(int x); @@ -3168,7 +3168,7 @@ __int_as_float __ll2double_rd ****************** -:: +:: __device__ double __ll2double_rd(long long int x); @@ -3178,7 +3178,7 @@ __ll2double_rd __ll2double_rn ****************** -:: +:: __device__ double __ll2double_rn(long long int x); @@ -3189,7 +3189,7 @@ __ll2double_rn __ll2double_ru ****************** -:: +:: __device__ double __ll2double_ru(long long int x); @@ -3200,7 +3200,7 @@ __ll2double_ru __ll2double_rz ****************** -:: +:: __device__ double __ll2double_rz(long long int x); @@ -3210,7 +3210,7 @@ __ll2double_rz __ll2float_rd ****************** -:: +:: __device__ float __ll2float_rd(long long int x); @@ -3220,7 +3220,7 @@ __ll2float_rd __ll2float_rn ****************** -:: +:: __device__ float __ll2float_rn(long long int x); @@ -3230,7 +3230,7 @@ __ll2float_rn __ll2float_ru ****************** -:: +:: __device__ float __ll2float_ru(long long int x); @@ -3240,7 +3240,7 @@ __ll2float_ru __ll2float_rz ****************** -:: +:: __device__ float __ll2float_rz(long long int x); @@ -3250,7 +3250,7 @@ __ll2float_rz __longlong_as_double *************************** -:: +:: __device__ double __longlong_as_double(long long int x); @@ -3260,7 +3260,7 @@ __longlong_as_double __uint2double_rn ****************** -:: +:: __device__ double __uint2double_rn(int x); @@ -3270,7 +3270,7 @@ __uint2double_rn __uint2float_rd ****************** -:: +:: __device__ float __uint2float_rd(unsigned int x); @@ -3280,7 +3280,7 @@ __uint2float_rd 
__uint2float_rn ****************** -:: +:: __device__ float __uint2float_rn(unsigned int x); @@ -3290,7 +3290,7 @@ __uint2float_rn __uint2float_ru ****************** -:: +:: __device__ float __uint2float_ru(unsigned int x); @@ -3300,7 +3300,7 @@ __uint2float_ru __uint2float_rz ****************** -:: +:: __device__ float __uint2float_rz(unsigned int x); @@ -3310,7 +3310,7 @@ __uint2float_rz __uint_as_float ****************** -:: +:: __device__ float __uint_as_float(unsigned int x); @@ -3320,7 +3320,7 @@ __uint_as_float __ull2double_rd ****************** -:: +:: __device__ double __ull2double_rd(unsigned long long int x); @@ -3330,7 +3330,7 @@ __ull2double_rd __ull2double_rn ****************** -:: +:: __device__ double __ull2double_rn(unsigned long long int x); @@ -3340,7 +3340,7 @@ __ull2double_rn __ull2double_ru ****************** -:: +:: __device__ double __ull2double_ru(unsigned long long int x); @@ -3350,7 +3350,7 @@ __ull2double_ru __ull2double_rz ****************** -:: +:: __device__ double __ull2double_rz(unsigned long long int x); @@ -3360,7 +3360,7 @@ __ull2double_rz __ull2float_rd ****************** -:: +:: __device__ float __ull2float_rd(unsigned long long int x); @@ -3370,7 +3370,7 @@ __ull2float_rd __ull2float_rn ****************** -:: +:: __device__ float __ull2float_rn(unsigned long long int x); @@ -3381,7 +3381,7 @@ __ull2float_rn __ull2float_ru ****************** -:: +:: __device__ float __ull2float_ru(unsigned long long int x); @@ -3391,7 +3391,7 @@ __ull2float_ru __ull2float_rz ****************** -:: +:: __device__ float __ull2float_rz(unsigned long long int x); @@ -3401,7 +3401,7 @@ __ull2float_rz __hadd ********* -:: +:: __device__ static __half __hadd(const __half a, const __half b); @@ -3411,7 +3411,7 @@ __hadd __hadd_sat ****************** -:: +:: __device__ static __half __hadd_sat(__half a, __half b); @@ -3421,7 +3421,7 @@ __hadd_sat __hfma ********* -:: +:: __device__ static __half __hfma(__half a, __half b, __half c); @@ -3431,7 +3431,7 @@ __hfma __hfma_sat ********* -:: +:: __device__ static __half __hfma_sat(__half a, __half b, __half c); @@ -3441,7 +3441,7 @@ __hfma_sat __hmul ********* -:: +:: __device__ static __half __hmul(__half a, __half b); @@ -3451,7 +3451,7 @@ __hmul __hmul_sat ********* -:: +:: __device__ static __half __hmul_sat(__half a, __half b); @@ -3461,7 +3461,7 @@ __hmul_sat __hneg ********* -:: +:: __device__ static __half __hneg(__half a); @@ -3471,7 +3471,7 @@ __hneg __hsub ********* -:: +:: __device__ static __half __hsub(__half a, __half b); @@ -3481,7 +3481,7 @@ __hsub __hsub_sat ********* -:: +:: __device__ static __half __hsub_sat(__half a, __half b); @@ -3491,7 +3491,7 @@ __hsub_sat hdiv ********* -:: +:: __device__ static __half hdiv(__half a, __half b); @@ -3501,7 +3501,7 @@ hdiv __hadd2 ********* -:: +:: __device__ static __half2 __hadd2(__half2 a, __half2 b); @@ -3511,7 +3511,7 @@ __hadd2 __hadd2_sat ****************** -:: +:: __device__ static __half2 __hadd2_sat(__half2 a, __half2 b); @@ -3521,7 +3521,7 @@ __hadd2_sat __hfma2 ********* -:: +:: __device__ static __half2 __hfma2(__half2 a, __half2 b, __half2 c); @@ -3531,7 +3531,7 @@ __hfma2 __hfma2_sat ****************** -:: +:: __device__ static __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c); @@ -3541,7 +3541,7 @@ __hfma2_sat __hmul2 ********* -:: +:: __device__ static __half2 __hmul2(__half2 a, __half2 b); @@ -3551,7 +3551,7 @@ __hmul2 __hmul2_sat ****************** -:: +:: __device__ static __half2 __hmul2_sat(__half2 a, __half2 b); @@ -3561,7 +3561,7 @@ __hmul2_sat 
__hsub2 ********* -:: +:: __device__ static __half2 __hsub2(__half2 a, __half2 b); @@ -3571,7 +3571,7 @@ __hsub2 __hneg2 ********* -:: +:: __device__ static __half2 __hneg2(__half2 a); @@ -3581,7 +3581,7 @@ __hneg2 __hsub2_sat ****************** -:: +:: __device__ static __half2 __hsub2_sat(__half2 a, __half2 b); @@ -3591,7 +3591,7 @@ __hsub2_sat h2div ********* -:: +:: __device__ static __half2 h2div(__half2 a, __half2 b); @@ -3601,7 +3601,7 @@ h2div __heq ********* -:: +:: __device__bool __heq(__half a, __half b); @@ -3611,7 +3611,7 @@ __heq __hge ********* -:: +:: __device__bool __hge(__half a, __half b); @@ -3621,7 +3621,7 @@ __hge __hgt ********* -:: +:: __device__bool __hgt(__half a, __half b); @@ -3631,7 +3631,7 @@ __hgt __hisinf ********* -:: +:: __device__bool __hisinf(__half a); @@ -3641,7 +3641,7 @@ __hisinf __hisnan ********* -:: +:: __device__bool __hisnan(__half a); @@ -3651,7 +3651,7 @@ __hisnan __hle ********* -:: +:: __device__bool __hle(__half a, __half b); @@ -3661,7 +3661,7 @@ __hle __hlt ********* -:: +:: __device__bool __hlt(__half a, __half b); @@ -3671,7 +3671,7 @@ __hlt __hne ********* -:: +:: __device__bool __hne(__half a, __half b); @@ -3681,7 +3681,7 @@ __hne __hbeq2 ********* -:: +:: __device__bool __hbeq2(__half2 a, __half2 b); @@ -3691,7 +3691,7 @@ __hbeq2 __hbge2 ********* -:: +:: __device__bool __hbge2(__half2 a, __half2 b); @@ -3701,7 +3701,7 @@ __hbge2 __hbgt2 ********* -:: +:: __device__bool __hbgt2(__half2 a, __half2 b); @@ -3711,7 +3711,7 @@ __hbgt2 __hble2 ********* -:: +:: __device__bool __hble2(__half2 a, __half2 b); @@ -3721,7 +3721,7 @@ __hble2 __hblt2 ********* -:: +:: __device__bool __hblt2(__half2 a, __half2 b); @@ -3731,7 +3731,7 @@ __hblt2 __hbne2 ********* -:: +:: __device__bool __hbne2(__half2 a, __half2 b); @@ -3741,7 +3741,7 @@ __hbne2 __heq2 ********* -:: +:: __device____half2 __heq2(__half2 a, __half2 b); @@ -3751,7 +3751,7 @@ __heq2 __hge2 ********* -:: +:: __device____half2 __hge2(__half2 a, __half2 b); @@ -3761,7 +3761,7 @@ __hge2 __hgt2 ********* -:: +:: __device____half2 __hgt2(__half2 a, __half2 b); @@ -3771,7 +3771,7 @@ __hgt2 __hisnan2 ********* -:: +:: __device____half2 __hisnan2(__half2 a); @@ -3781,7 +3781,7 @@ __hisnan2 __hle2 ********* -:: +:: __device____half2 __hle2(__half2 a, __half2 b); @@ -3791,7 +3791,7 @@ __hle2 __hlt2 ********* -:: +:: __device____half2 __hlt2(__half2 a, __half2 b); @@ -3801,7 +3801,7 @@ __hlt2 __hne2 ********* -:: +:: __device____half2 __hne2(__half2 a, __half2 b); @@ -3811,7 +3811,7 @@ __hne2 hceil ********* -:: +:: __device__ static __half hceil(const __half h); @@ -3821,7 +3821,7 @@ hceil hcos ********* -:: +:: __device__ static __half hcos(const __half h); @@ -3831,8 +3831,8 @@ hcos hexp ********* -:: - +:: + __device__ static __half hexp(const __half h); @@ -3841,7 +3841,7 @@ hexp hexp10 ********* -:: +:: __device__ static __half hexp10(const __half h); @@ -3851,7 +3851,7 @@ hexp10 hexp2 ********* -:: +:: __device__ static __half hexp2(const __half h); @@ -3861,7 +3861,7 @@ hexp2 hfloor ********* -:: +:: __device__ static __half hfloor(const __half h); @@ -3871,7 +3871,7 @@ hfloor hlog ********* -:: +:: __device__ static __half hlog(const __half h); @@ -3881,7 +3881,7 @@ hlog hlog10 ********* -:: +:: __device__ static __half hlog10(const __half h); @@ -3891,7 +3891,7 @@ hlog10 hlog2 ********* -:: +:: __device__ static __half hlog2(const __half h); @@ -3901,8 +3901,8 @@ hlog2 hrcp ********* -:: - +:: + //__device__ static __half hrcp(const __half h); @@ -3911,7 +3911,7 @@ hrcp hrint ********* 
-:: +:: __device__ static __half hrint(const __half h); @@ -3921,7 +3921,7 @@ hrint hsin ********* -:: +:: __device__ static __half hsin(const __half h); @@ -3931,7 +3931,7 @@ hsin hsqrt ********* -:: +:: __device__ static __half hsqrt(const __half a); @@ -3941,7 +3941,7 @@ hsqrt htrunc ********* -:: +:: __device__ static __half htrunc(const __half a); @@ -3951,7 +3951,7 @@ htrunc h2ceil ********* -:: +:: __device__ static __half2 h2ceil(const __half2 h); @@ -3961,7 +3961,7 @@ h2ceil h2exp ********* -:: +:: __device__ static __half2 h2exp(const __half2 h); @@ -3971,7 +3971,7 @@ h2exp h2exp10 ********* -:: +:: __device__ static __half2 h2exp10(const __half2 h); @@ -3981,7 +3981,7 @@ h2exp10 h2exp2 ********* -:: +:: __device__ static __half2 h2exp2(const __half2 h); @@ -3991,7 +3991,7 @@ h2exp2 h2floor ********* -:: +:: __device__ static __half2 h2floor(const __half2 h); @@ -4001,7 +4001,7 @@ h2floor h2log ********* -:: +:: __device__ static __half2 h2log(const __half2 h); @@ -4011,7 +4011,7 @@ h2log h2log10 ********* -:: +:: __device__ static __half2 h2log10(const __half2 h); @@ -4021,7 +4021,7 @@ h2log10 h2log2 ********* -:: +:: __device__ static __half2 h2log2(const __half2 h); @@ -4031,7 +4031,7 @@ h2log2 h2rcp ********* -:: +:: __device__ static __half2 h2rcp(const __half2 h); @@ -4041,8 +4041,8 @@ h2rcp h2rsqrt ********* -:: - +:: + __device__ static __half2 h2rsqrt(const __half2 h); @@ -4050,8 +4050,8 @@ h2rsqrt h2sin -********* -:: +********* +:: __device__ static __half2 h2sin(const __half2 h); @@ -4061,8 +4061,8 @@ h2sin h2sqrt ********* -:: - +:: + __device__ static __half2 h2sqrt(const __half2 h); @@ -4071,7 +4071,7 @@ h2sqrt __float22half2_rn ****************** -:: +:: __device____half2 __float22half2_rn(const float2 a); @@ -4081,7 +4081,7 @@ __float22half2_rn __float2half ****************** -:: +:: __device____half __float2half(const float a); @@ -4091,8 +4091,8 @@ __float2half __float2half2_rn ****************** -:: - +:: + __device____half2 __float2half2_rn(const float a); @@ -4101,7 +4101,7 @@ __float2half2_rn __float2half_rd ****************** -:: +:: __device____half __float2half_rd(const float a); @@ -4111,7 +4111,7 @@ __float2half_rd __float2half_rn ****************** -:: +:: __device____half __float2half_rn(const float a); @@ -4121,7 +4121,7 @@ __float2half_rn __float2half_ru ****************** -:: +:: __device____half __float2half_ru(const float a); @@ -4131,7 +4131,7 @@ __float2half_ru __float2half_rz ****************** -:: +:: __device____half __float2half_rz(const float a); @@ -4141,7 +4141,7 @@ __float2half_rz __floats2half2_rn ****************** -:: +:: __device____half2 __floats2half2_rn(const float a, const float b); @@ -4151,7 +4151,7 @@ __floats2half2_rn __half22float2 ****************** -:: +:: __device__float2 __half22float2(const __half2 a); @@ -4161,7 +4161,7 @@ __half22float2 __half2float ****************** -:: +:: __device__float __half2float(const __half a); @@ -4171,7 +4171,7 @@ __half2float half2half2 ****************** -:: +:: __device____half2 half2half2(const __half a); @@ -4181,7 +4181,7 @@ half2half2 __half2int_rd ****************** -:: +:: __device__int __half2int_rd(__half h); @@ -4191,7 +4191,7 @@ __half2int_rd __half2int_rn ****************** -:: +:: __device__int __half2int_rn(__half h); @@ -4201,7 +4201,7 @@ __half2int_rn __half2int_ru ****************** -:: +:: __device__int __half2int_ru(__half h); @@ -4211,7 +4211,7 @@ __half2int_ru __half2int_rz ****************** -:: +:: __device__int __half2int_rz(__half h); @@ -4221,7 +4221,7 @@ 
__half2int_rz __half2ll_rd ****************** -:: +:: __device__long long int __half2ll_rd(__half h); @@ -4231,7 +4231,7 @@ __half2ll_rd __half2ll_rn ****************** -:: +:: __device__long long int __half2ll_rn(__half h); @@ -4241,7 +4241,7 @@ __half2ll_rn __half2ll_ru ****************** -:: +:: __device__long long int __half2ll_ru(__half h); @@ -4251,7 +4251,7 @@ __half2ll_ru __half2ll_rz ****************** -:: +:: __device__long long int __half2ll_rz(__half h); @@ -4261,7 +4261,7 @@ __half2ll_rz __half2short_rd ****************** -:: +:: __device__short __half2short_rd(__half h); @@ -4271,7 +4271,7 @@ __half2short_rd __half2short_rn ****************** -:: +:: __device__short __half2short_rn(__half h); @@ -4281,7 +4281,7 @@ __half2short_rn __half2short_ru ****************** -:: +:: __device__short __half2short_ru(__half h); @@ -4292,7 +4292,7 @@ __half2short_ru __half2short_rz ****************** -:: +:: __device__short __half2short_rz(__half h); @@ -4302,7 +4302,7 @@ __half2short_rz __half2uint_rd ****************** -:: +:: __device__unsigned int __half2uint_rd(__half h); @@ -4312,7 +4312,7 @@ __half2uint_rd __half2uint_rn ****************** -:: +:: __device__unsigned int __half2uint_rn(__half h); @@ -4322,7 +4322,7 @@ __half2uint_rn __half2uint_ru ****************** -:: +:: __device__unsigned int __half2uint_ru(__half h); @@ -4332,7 +4332,7 @@ __half2uint_ru __half2uint_rz ****************** -:: +:: __device__unsigned int __half2uint_rz(__half h); @@ -4342,7 +4342,7 @@ __half2uint_rz __half2ull_rd ****************** -:: +:: __device__unsigned long long int __half2ull_rd(__half h); @@ -4352,7 +4352,7 @@ __half2ull_rd __half2ull_rn ****************** -:: +:: __device__unsigned long long int __half2ull_rn(__half h); @@ -4362,7 +4362,7 @@ __half2ull_rn __half2ull_ru ****************** -:: +:: __device__unsigned long long int __half2ull_ru(__half h); @@ -4372,7 +4372,7 @@ __half2ull_ru __half2ull_rz ****************** -:: +:: __device__unsigned long long int __half2ull_rz(__half h); @@ -4382,7 +4382,7 @@ __half2ull_rz __half2ushort_rd ****************** -:: +:: __device__unsigned short int __half2ushort_rd(__half h); @@ -4392,7 +4392,7 @@ __half2ushort_rd __half2ushort_rn ****************** -:: +:: __device__unsigned short int __half2ushort_rn(__half h); @@ -4402,7 +4402,7 @@ __half2ushort_rn __half2ushort_ru ****************** -:: +:: __device__unsigned short int __half2ushort_ru(__half h); @@ -4412,7 +4412,7 @@ __half2ushort_ru __half2ushort_rz ****************** -:: +:: __device__unsigned short int __half2ushort_rz(__half h); @@ -4422,7 +4422,7 @@ __half2ushort_rz __half_as_short ****************** -:: +:: __device__short int __half_as_short(const __half h); @@ -4432,7 +4432,7 @@ __half_as_short __half_as_ushort ****************** -:: +:: __device__unsigned short int __half_as_ushort(const __half h); @@ -4442,7 +4442,7 @@ __half_as_ushort __halves2half2 ****************** -:: +:: __device____half2 __halves2half2(const __half a, const __half b); @@ -4452,8 +4452,8 @@ __halves2half2 __high2float ****************** -:: - +:: + __device__float __high2float(const __half2 a); @@ -4462,7 +4462,7 @@ __high2float __high2half ****************** -:: +:: __device____half __high2half(const __half2 a); @@ -4472,7 +4472,7 @@ __high2half __high2half2 ****************** -:: +:: __device____half2 __high2half2(const __half2 a); @@ -4482,7 +4482,7 @@ __high2half2 __highs2half2 ****************** -:: +:: __device____half2 __highs2half2(const __half2 a, const __half2 b); @@ -4492,7 +4492,7 @@ __highs2half2 
__int2half_rd ****************** -:: +:: __device____half __int2half_rd(int i); @@ -4502,7 +4502,7 @@ __int2half_rd __int2half_rn ****************** -:: +:: __device____half __int2half_rn(int i); @@ -4512,7 +4512,7 @@ __int2half_rn __int2half_ru ****************** -:: +:: __device____half __int2half_ru(int i); @@ -4522,7 +4522,7 @@ __int2half_ru __int2half_rz ****************** -:: +:: __device____half __int2half_rz(int i); @@ -4532,7 +4532,7 @@ __int2half_rz __ll2half_rd ****************** -:: +:: __device____half __ll2half_rd(long long int i); @@ -4542,7 +4542,7 @@ __ll2half_rd __ll2half_rn ****************** -:: +:: __device____half __ll2half_rn(long long int i); @@ -4552,7 +4552,7 @@ __ll2half_rn __ll2half_ru ****************** -:: +:: __device____half __ll2half_ru(long long int i); @@ -4562,7 +4562,7 @@ __ll2half_ru __ll2half_rz ****************** -:: +:: __device____half __ll2half_rz(long long int i); @@ -4572,7 +4572,7 @@ __ll2half_rz __low2float ****************** -:: +:: __device__float __low2float(const __half2 a); @@ -4582,7 +4582,7 @@ __low2float __low2half ****************** -:: +:: __device__ __half __low2half(const __half2 a); @@ -4592,7 +4592,7 @@ __low2half __low2half2 ****************** -:: +:: __device__ __half2 __low2half2(const __half2 a, const __half2 b); @@ -4602,7 +4602,7 @@ __low2half2 __low2half2 ****************** -:: +:: __device__ __half2 __low2half2(const __half2 a); @@ -4612,7 +4612,7 @@ __low2half2 __lowhigh2highlow ****************** -:: +:: __device__ __half2 __lowhigh2highlow(const __half2 a); @@ -4622,7 +4622,7 @@ __lowhigh2highlow __lows2half2 ****************** -:: +:: __device__ __half2 __lows2half2(const __half2 a, const __half2 b); @@ -4632,7 +4632,7 @@ __lows2half2 __short2half_rd ****************** -:: +:: __device____half __short2half_rd(short int i); @@ -4642,7 +4642,7 @@ __short2half_rd __short2half_rn ****************** -:: +:: __device____half __short2half_rn(short int i); @@ -4652,7 +4652,7 @@ __short2half_rn __short2half_ru ****************** -:: +:: __device____half __short2half_ru(short int i); @@ -4662,7 +4662,7 @@ __short2half_ru __short2half_rz ****************** -:: +:: __device____half __short2half_rz(short int i); @@ -4672,7 +4672,7 @@ __short2half_rz __uint2half_rd ****************** -:: +:: __device____half __uint2half_rd(unsigned int i); @@ -4682,7 +4682,7 @@ __uint2half_rd __uint2half_rn ****************** -:: +:: __device____half __uint2half_rn(unsigned int i); @@ -4692,7 +4692,7 @@ __uint2half_rn __uint2half_ru ****************** -:: +:: __device____half __uint2half_ru(unsigned int i); @@ -4702,7 +4702,7 @@ __uint2half_ru __uint2half_rz ****************** -:: +:: __device____half __uint2half_rz(unsigned int i); @@ -4712,7 +4712,7 @@ __uint2half_rz __ull2half_rd ****************** -:: +:: __device____half __ull2half_rd(unsigned long long int i); @@ -4722,7 +4722,7 @@ __ull2half_rd __ull2half_rn ****************** -:: +:: __device____half __ull2half_rn(unsigned long long int i); @@ -4732,7 +4732,7 @@ __ull2half_rn __ull2half_ru ****************** -:: +:: __device____half __ull2half_ru(unsigned long long int i); @@ -4742,8 +4742,8 @@ __ull2half_ru __ull2half_rz ****************** -:: - +:: + __device____half __ull2half_rz(unsigned long long int i); @@ -4752,7 +4752,7 @@ __ull2half_rz __ushort2half_rd ********* -:: +:: __device____half __ushort2half_rd(unsigned short int i); @@ -4762,7 +4762,7 @@ __ushort2half_rd __ushort2half_rn ****************** -:: +:: __device____half __ushort2half_rn(unsigned short int i); @@ -4772,7 +4772,7 
@@ __ushort2half_rn __ushort2half_ru ****************** -:: +:: __device____half __ushort2half_ru(unsigned short int i); @@ -4782,7 +4782,7 @@ __ushort2half_ru __ushort2half_rz ****************** -:: +:: __device____half __ushort2half_rz(unsigned short int i); @@ -4792,7 +4792,7 @@ __ushort2half_rz __ushort_as_half ****************** -:: +:: __device____half __ushort_as_half(const unsigned short int i); diff --git a/ROCm_API_References/HIP_API/Context-Management.rst b/ROCm_API_References/HIP_API/Context-Management.rst index c5e08c69..ae7a6d34 100644 --- a/ROCm_API_References/HIP_API/Context-Management.rst +++ b/ROCm_API_References/HIP_API/Context-Management.rst @@ -15,29 +15,29 @@ hipCtxPopCurrent ---------------- .. doxygenfunction:: hipCtxPopCurrent -hipCtxPushCurrent +hipCtxPushCurrent ------------------ -.. doxygenfunction:: hipCtxPushCurrent +.. doxygenfunction:: hipCtxPushCurrent -hipCtxSetCurrent +hipCtxSetCurrent ---------------- -.. doxygenfunction:: hipCtxSetCurrent +.. doxygenfunction:: hipCtxSetCurrent -hipCtxGetCurrent +hipCtxGetCurrent ---------------- -.. doxygenfunction:: hipCtxGetCurrent +.. doxygenfunction:: hipCtxGetCurrent -hipCtxGetDevice +hipCtxGetDevice ---------------- -.. doxygenfunction:: hipCtxGetDevice +.. doxygenfunction:: hipCtxGetDevice -hipCtxGetApiVersion +hipCtxGetApiVersion -------------------- -.. doxygenfunction:: hipCtxGetApiVersion +.. doxygenfunction:: hipCtxGetApiVersion -hipCtxGetCacheConfig +hipCtxGetCacheConfig ---------------------- -.. doxygenfunction:: hipCtxGetCacheConfig +.. doxygenfunction:: hipCtxGetCacheConfig hipCtxSetSharedMemConfig -------------------------- @@ -47,25 +47,25 @@ hipCtxGetSharedMemConfig -------------------------- .. doxygenfunction:: hipCtxGetSharedMemConfig -hipCtxSynchronize +hipCtxSynchronize ------------------ -.. doxygenfunction:: hipCtxSynchronize +.. doxygenfunction:: hipCtxSynchronize -hipCtxGetFlags +hipCtxGetFlags ---------------- -.. doxygenfunction:: hipCtxGetFlags +.. doxygenfunction:: hipCtxGetFlags -hipCtxEnablePeerAccess +hipCtxEnablePeerAccess ------------------------ -.. doxygenfunction:: hipCtxEnablePeerAccess +.. doxygenfunction:: hipCtxEnablePeerAccess -hipCtxDisablePeerAccess +hipCtxDisablePeerAccess ------------------------ -.. doxygenfunction:: hipCtxDisablePeerAccess +.. doxygenfunction:: hipCtxDisablePeerAccess -hipDevicePrimaryCtxGetState +hipDevicePrimaryCtxGetState ----------------------------- -.. doxygenfunction:: hipDevicePrimaryCtxGetState +.. doxygenfunction:: hipDevicePrimaryCtxGetState hipDevicePrimaryCtxRelease ---------------------------- @@ -77,11 +77,11 @@ hipDevicePrimaryCtxRetain hipDevicePrimaryCtxReset --------------------------- -.. doxygenfunction:: hipDevicePrimaryCtxReset +.. doxygenfunction:: hipDevicePrimaryCtxReset -hipDevicePrimaryCtxSetFlags +hipDevicePrimaryCtxSetFlags ---------------------------- -.. doxygenfunction:: hipDevicePrimaryCtxSetFlags +.. doxygenfunction:: hipDevicePrimaryCtxSetFlags diff --git a/ROCm_API_References/HIP_API/Control.rst b/ROCm_API_References/HIP_API/Control.rst index f85012b5..239d4fe6 100644 --- a/ROCm_API_References/HIP_API/Control.rst +++ b/ROCm_API_References/HIP_API/Control.rst @@ -7,9 +7,9 @@ hipProfilerStart ---------------- .. doxygenfunction:: hipProfilerStart -hipProfilerStop +hipProfilerStop ---------------- -.. doxygenfunction::hipProfilerStop +.. 
doxygenfunction::hipProfilerStop diff --git a/ROCm_API_References/HIP_API/Device-Memory-Access.rst b/ROCm_API_References/HIP_API/Device-Memory-Access.rst index fc35e2a8..42d98d50 100644 --- a/ROCm_API_References/HIP_API/Device-Memory-Access.rst +++ b/ROCm_API_References/HIP_API/Device-Memory-Access.rst @@ -7,9 +7,9 @@ hipDeviceCanAccessPeer ------------------------ .. doxygenfunction:: hipDeviceCanAccessPeer -hipDeviceEnablePeerAccess +hipDeviceEnablePeerAccess --------------------------- -.. doxygenfunction:: hipDeviceEnablePeerAccess +.. doxygenfunction:: hipDeviceEnablePeerAccess hipDeviceDisablePeerAccess ---------------------------- @@ -23,8 +23,8 @@ hipMemcpyPeer ------------------------ .. doxygenfunction:: hipMemcpyPeer -hipMemcpyPeerAsync +hipMemcpyPeerAsync ------------------------ -.. doxygenfunction:: hipMemcpyPeerAsync +.. doxygenfunction:: hipMemcpyPeerAsync diff --git a/ROCm_API_References/HIP_API/Device-management.rst b/ROCm_API_References/HIP_API/Device-management.rst index 070e429b..81a8ef22 100644 --- a/ROCm_API_References/HIP_API/Device-management.rst +++ b/ROCm_API_References/HIP_API/Device-management.rst @@ -1,20 +1,20 @@ .. _Device-management: - + Device management ================== Device management types and functions. -hipDeviceSynchronize +hipDeviceSynchronize ----------------------- -.. doxygenfunction:: hipDeviceSynchronize +.. doxygenfunction:: hipDeviceSynchronize -hipDeviceReset +hipDeviceReset --------------- -.. doxygenfunction:: hipDeviceReset +.. doxygenfunction:: hipDeviceReset hipSetDevice ------------- @@ -24,7 +24,7 @@ hipSetDevice hipGetDevice ---------------- -.. doxygenfunction:: hipGetDevice +.. doxygenfunction:: hipGetDevice hipGetDeviceCount ----------------- @@ -53,14 +53,14 @@ hipDeviceGetLimit ------------------ .. doxygenfunction:: hipDeviceGetLimit -hipFuncSetCacheConfig +hipFuncSetCacheConfig ---------------------- -.. doxygenfunction:: hipFuncSetCacheConfig +.. doxygenfunction:: hipFuncSetCacheConfig -hipDeviceGetSharedMemConfig +hipDeviceGetSharedMemConfig --------------------------- -.. doxygenfunction:: hipDeviceGetSharedMemConfig +.. doxygenfunction:: hipDeviceGetSharedMemConfig hipDeviceSetSharedMemConfig ---------------------------- @@ -72,9 +72,9 @@ hipSetDeviceFlags .. doxygenfunction:: hipSetDeviceFlags -hipChooseDevice +hipChooseDevice ---------------- -.. doxygenfunction:: hipChooseDevice +.. doxygenfunction:: hipChooseDevice diff --git a/ROCm_API_References/HIP_API/Error.rst b/ROCm_API_References/HIP_API/Error.rst index 36f256f0..292ea368 100644 --- a/ROCm_API_References/HIP_API/Error.rst +++ b/ROCm_API_References/HIP_API/Error.rst @@ -6,17 +6,17 @@ Error Handling Error Handling types and functions. -hipGetLastError +hipGetLastError ---------------- -.. doxygenfunction:: hipGetLastError +.. doxygenfunction:: hipGetLastError -hipPeekAtLastError +hipPeekAtLastError ------------------- -.. doxygenfunction:: hipPeekAtLastError +.. doxygenfunction:: hipPeekAtLastError -hipGetErrorName +hipGetErrorName ---------------- -.. doxygenfunction:: hipGetErrorName +.. doxygenfunction:: hipGetErrorName hipGetErrorString ------------------- diff --git a/ROCm_API_References/HIP_API/Event-Management.rst b/ROCm_API_References/HIP_API/Event-Management.rst index 19d19993..f65d4d8b 100644 --- a/ROCm_API_References/HIP_API/Event-Management.rst +++ b/ROCm_API_References/HIP_API/Event-Management.rst @@ -3,13 +3,13 @@ Event Management ================= -hipEventCreateWithFlags +hipEventCreateWithFlags ------------------------ -.. 
doxygenfunction:: hipEventCreateWithFlags +.. doxygenfunction:: hipEventCreateWithFlags -hipEventCreate +hipEventCreate ---------------- -.. doxygenfunction:: hipEventCreate +.. doxygenfunction:: hipEventCreate hipEventRecord ---------------- @@ -29,7 +29,7 @@ hipEventElapsedTime hipEventQuery S ---------------- -.. doxygenfunction:: hipEventQuery +.. doxygenfunction:: hipEventQuery diff --git a/ROCm_API_References/HIP_API/Initialization-and-Version.rst b/ROCm_API_References/HIP_API/Initialization-and-Version.rst index b2b45e94..5b1da2db 100644 --- a/ROCm_API_References/HIP_API/Initialization-and-Version.rst +++ b/ROCm_API_References/HIP_API/Initialization-and-Version.rst @@ -12,21 +12,21 @@ hipDeviceGet ---------------- .. doxygenfunction:: hipDeviceGet -hipDeviceComputeCapability +hipDeviceComputeCapability ----------------------------- -.. doxygenfunction:: hipDeviceComputeCapability +.. doxygenfunction:: hipDeviceComputeCapability -hipDeviceGetName +hipDeviceGetName ---------------- -.. doxygenfunction:: hipDeviceGetName +.. doxygenfunction:: hipDeviceGetName -hipDeviceGetPCIBusId +hipDeviceGetPCIBusId --------------------- -.. doxygenfunction:: hipDeviceGetPCIBusId +.. doxygenfunction:: hipDeviceGetPCIBusId -hipDeviceGetByPCIBusId +hipDeviceGetByPCIBusId ----------------------- -.. doxygenfunction:: hipDeviceGetByPCIBusId +.. doxygenfunction:: hipDeviceGetByPCIBusId hipDeviceTotalMem --------------------- @@ -44,9 +44,9 @@ hipModuleLoad ---------------- .. doxygenfunction:: hipModuleLoad -hipModuleUnload +hipModuleUnload ---------------- -.. doxygenfunction:: hipModuleUnload +.. doxygenfunction:: hipModuleUnload hipModuleGetFunction --------------------- @@ -64,8 +64,8 @@ hipModuleLoadDataEx -------------------- .. doxygenfunction:: hipModuleLoadDataEx -hipModuleLaunchKernel +hipModuleLaunchKernel ---------------------- -.. doxygenfunction:: hipModuleLaunchKernel +.. doxygenfunction:: hipModuleLaunchKernel diff --git a/ROCm_API_References/HIP_API/Memory-Management.rst b/ROCm_API_References/HIP_API/Memory-Management.rst index 07da5255..fd53354c 100644 --- a/ROCm_API_References/HIP_API/Memory-Management.rst +++ b/ROCm_API_References/HIP_API/Memory-Management.rst @@ -7,9 +7,9 @@ hipPointerGetAttributes ------------------------ .. doxygenfunction:: hipPointerGetAttributes -hipMalloc +hipMalloc ------------------------ -.. doxygenfunction:: hipMalloc +.. doxygenfunction:: hipMalloc hipMallocHost ------------------------ @@ -19,9 +19,9 @@ hipHostMalloc ------------------------ .. doxygenfunction:: hipHostMalloc -hipHostAlloc +hipHostAlloc ------------------------ -.. doxygenfunction:: hipHostAlloc +.. doxygenfunction:: hipHostAlloc hipHostGetDevicePointer ------------------------ @@ -31,9 +31,9 @@ hipHostGetFlags ------------------------ .. doxygenfunction:: hipHostGetFlags -hipHostRegister +hipHostRegister ------------------------ -.. doxygenfunction:: hipHostRegister +.. doxygenfunction:: hipHostRegister hipHostUnregister ------------------------ @@ -51,9 +51,9 @@ hipFreeHost ------------------------ .. doxygenfunction:: hipFreeHost -hipMemcpy +hipMemcpy ------------------------ -.. doxygenfunction:: hipMemcpy +.. doxygenfunction:: hipMemcpy hipMemcpyHtoD ------------------------ @@ -87,13 +87,13 @@ hipMemcpyToSymbolAsync ------------------------ .. doxygenfunction:: hipMemcpyToSymbolAsync -hipMemcpyFromSymbol +hipMemcpyFromSymbol ------------------------ -.. doxygenfunction:: hipMemcpyFromSymbol +.. 
doxygenfunction:: hipMemcpyFromSymbol -hipMemcpyFromSymbolAsync +hipMemcpyFromSymbolAsync ------------------------ -.. doxygenfunction:: hipMemcpyFromSymbolAsync +.. doxygenfunction:: hipMemcpyFromSymbolAsync hipMemcpyAsync ------------------------ @@ -103,21 +103,21 @@ hipMemset ------------------------ .. doxygenfunction:: hipMemset -hipMemsetD8 +hipMemsetD8 ------------------------ -.. doxygenfunction:: hipMemsetD8 +.. doxygenfunction:: hipMemsetD8 -hipMemsetAsync +hipMemsetAsync ------------------------ -.. doxygenfunction:: hipMemsetAsync +.. doxygenfunction:: hipMemsetAsync -hipMemset2D +hipMemset2D ------------------------ -.. doxygenfunction:: hipMemset2D +.. doxygenfunction:: hipMemset2D -hipMemGetInfo +hipMemGetInfo ------------------------ -.. doxygenfunction:: hipMemGetInfo +.. doxygenfunction:: hipMemGetInfo hipMemPtrGetInfo ------------------------ diff --git a/ROCm_API_References/HIP_API/Stream-Management.rst b/ROCm_API_References/HIP_API/Stream-Management.rst index 946ef584..3011e056 100644 --- a/ROCm_API_References/HIP_API/Stream-Management.rst +++ b/ROCm_API_References/HIP_API/Stream-Management.rst @@ -19,21 +19,21 @@ hipDeviceGetStreamPriorityRange -------------------------------- .. doxygenfunction:: hipDeviceGetStreamPriorityRange -hipStreamDestroy +hipStreamDestroy ---------------- -.. doxygenfunction:: hipStreamDestroy +.. doxygenfunction:: hipStreamDestroy -hipStreamQuery +hipStreamQuery ---------------- -.. doxygenfunction:: hipStreamQuery +.. doxygenfunction:: hipStreamQuery hipStreamSynchronize --------------------- .. doxygenfunction:: hipStreamSynchronize -hipStreamWaitEvent +hipStreamWaitEvent ------------------- -.. doxygenfunction:: hipStreamWaitEvent +.. doxygenfunction:: hipStreamWaitEvent hipStreamGetFlags ---------------- @@ -43,6 +43,6 @@ hipStreamGetPriority --------------------- .. doxygenfunction:: hipStreamGetPriority -hipStreamAddCallback +hipStreamAddCallback --------------------- -.. doxygenfunction:: hipStreamAddCallback +.. doxygenfunction:: hipStreamAddCallback diff --git a/ROCm_API_References/ROCr-API.rst b/ROCm_API_References/ROCr-API.rst index b0595b7e..8ad3d346 100644 --- a/ROCm_API_References/ROCr-API.rst +++ b/ROCm_API_References/ROCr-API.rst @@ -24,10 +24,10 @@ common definition Initialization and Shut Down ----------------------------- -.. doxygenfunction:: hsa_init() +.. doxygenfunction:: hsa_init() :project: rocr -.. doxygenfunction:: hsa_shut_down() +.. doxygenfunction:: hsa_shut_down() :project: rocr System and Agent Information @@ -70,12 +70,12 @@ System and Agent Information .. doxygenfunction:: hsa_agent_get_info() :project: rocr -.. doxygenfunction:: hsa_agent_iterate_caches() +.. doxygenfunction:: hsa_agent_iterate_caches() :project: rocr - + .. doxygenfunction:: hsa_agent_major_extension_supported() :project: rocr - + .. doxygenfunction:: hsa_cache_get_info() :project: rocr diff --git a/ROCm_API_References/Thrust.rst b/ROCm_API_References/Thrust.rst index de16e55b..0c868090 100644 --- a/ROCm_API_References/Thrust.rst +++ b/ROCm_API_References/Thrust.rst @@ -1,7 +1,7 @@ .. 
_HIP-thrust: -hipThrust +hipThrust ########## HIP back-end for Thrust @@ -27,7 +27,7 @@ AMD ROCm Installation $ sudo sh -c 'echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list' $ sudo apt-get update $ sudo apt install rocm-dkms - + Thrust Build Steps: :: $ git clone https://github.com/ROCmSoftwarePlatform/Thrust.git @@ -46,13 +46,13 @@ Steps to follow: $ cd examples $ ./cu_to_cpp.sh $ ./script_compile_testing_hcc.sh - + To execute applications: :: $ cd Thrust/ $ ./script_run_hcc.sh foldername (eg:examples/testing/performance) - + Sample applications @@ -69,7 +69,7 @@ transform_iterator: sequence : 0 1 2 3 4 5 6 7 8 9 clamped sequence : 1 1 2 3 4 5 5 5 5 5 negated sequence : -1 -1 -2 -3 -4 -5 -5 -5 -5 -5 - negated values : -2 -5 -7 -1 -6 0 -3 -8 + negated values : -2 -5 -7 -1 -6 0 -3 -8 sort: :: @@ -106,38 +106,38 @@ expand: :: $ ./expand.out Expanding values according to counts - counts 3 5 2 0 1 3 4 2 4 - values 1 2 3 4 5 6 7 8 9 - output 1 1 1 2 2 2 2 2 3 3 5 6 6 6 7 7 7 7 8 8 9 9 9 9 - + counts 3 5 2 0 1 3 4 2 4 + values 1 2 3 4 5 6 7 8 9 + output 1 1 1 2 2 2 2 2 3 3 5 6 6 6 7 7 7 7 8 8 9 9 9 9 + Unit Test ************ -| The test suite consists of unit tests. +| The test suite consists of unit tests. | Run the following commands to perform unit testing of different components of Thrust. .. note:: Set HIP_PLATFORM to either NVCC or HCC depending on the platform being used :: - + $ cd Thrust/testing $ ./cu_to_cpp.sh $ ./script_compile_testing_hcc.sh -To execute unit tests: +To execute unit tests: :: $ cd Thrust/ $ ./script_run_hcc.sh testing/ Sample output of transform and Max element test cases :: - - ./transform.out + + ./transform.out Running 34 unit tests. .................................. Totals: 0 failures, 0 known failures, 0 errors, and 34 passes. Time: 0.366667 minutes - + ./max_element.out Running 7 unit tests. .................................. @@ -152,20 +152,20 @@ Run the following commands to exercise Performance tests in Thrust .. note:: Set HIP_PLATFORM to either NVCC or HCC depending on the platform being used :: - + $ cd Thrust/performance $ ./script_compile_performance.sh -To execute performance tests: -:: +To execute performance tests: +:: $ cd Thrust/ $ ./script_run_hcc.sh performance/ - + :: - + ./adjacent_difference.cpp.out - + @@ -195,7 +195,7 @@ To execute performance tests: - + diff --git a/ROCm_API_References/api.rst b/ROCm_API_References/api.rst index bdfb6ff3..bf80aac6 100644 --- a/ROCm_API_References/api.rst +++ b/ROCm_API_References/api.rst @@ -1,12 +1,12 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* rocSOLVER API ************* -This section provides details of the rocSOLVER library API as in release +This section provides details of the rocSOLVER library API as in release `ROCm 2.10 `_. @@ -14,7 +14,7 @@ This section provides details of the rocSOLVER library API as in release Types ===== -Most rocSOLVER types are aliases of rocBLAS types. +Most rocSOLVER types are aliases of rocBLAS types. See rocBLAS types `here `_. Definitions @@ -312,7 +312,7 @@ rocsolver_getrs_strided_batched() Auxiliaries ========================= -rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions +rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions `here `_. 
rocSOLVER handle auxiliaries diff --git a/ROCm_API_References/clBLAS.rst b/ROCm_API_References/clBLAS.rst index 4ee29ec2..dacab15c 100644 --- a/ROCm_API_References/clBLAS.rst +++ b/ROCm_API_References/clBLAS.rst @@ -3,13 +3,13 @@ clBLAS API Documentation ========================= -This is an implementation of Basic Linear Algebra Subprograms, levels 1, 2 and 3 using OpenCL and optimized for the AMD GPU hardware. +This is an implementation of Basic Linear Algebra Subprograms, levels 1, 2 and 3 using OpenCL and optimized for the AMD GPU hardware. * `BLAS1 `_ - The Level 1 Basic Linear Algebra Subprograms are functions that perform vector-vector operations. + The Level 1 Basic Linear Algebra Subprograms are functions that perform vector-vector operations. * `BLAS2 `_ - The Level 2 Basic Linear Algebra Subprograms are functions that perform matrix-vector operations. + The Level 2 Basic Linear Algebra Subprograms are functions that perform matrix-vector operations. * `BLAS3 `_ - The Level 3 Basic Linear Algebra Subprograms are funcions that perform matrix-matrix operations. + The Level 3 Basic Linear Algebra Subprograms are funcions that perform matrix-matrix operations. diff --git a/ROCm_API_References/clSPARSE_API.rst b/ROCm_API_References/clSPARSE_API.rst index ae5de082..7ae923ed 100644 --- a/ROCm_API_References/clSPARSE_API.rst +++ b/ROCm_API_References/clSPARSE_API.rst @@ -17,11 +17,11 @@ Routines to initialize a clsparse object .. doxygenfunction:: cldenseInitMatrix() -.. doxygenfunction:: clsparseInitCooMatrix() +.. doxygenfunction:: clsparseInitCooMatrix() -.. doxygenfunction:: clsparseInitCsrMatrix() +.. doxygenfunction:: clsparseInitCsrMatrix() -.. doxygenfunction:: clsparseInitScalar() +.. doxygenfunction:: clsparseInitScalar() .. doxygenfunction:: clsparseInitScalar() diff --git a/ROCm_API_References/clSPARSE_api.rst b/ROCm_API_References/clSPARSE_api.rst index 7d65c9b2..f1f950c1 100644 --- a/ROCm_API_References/clSPARSE_api.rst +++ b/ROCm_API_References/clSPARSE_api.rst @@ -3,13 +3,13 @@ clSPARSE API Documentation ========================== -It is an OpenCL library implementing Sparse linear algebra routines. +It is an OpenCL library implementing Sparse linear algebra routines. - * `Dense L1 BLAS operations `_ + * `Dense L1 BLAS operations `_ Dense BLAS level 1 routines for dense vectors - + * `Sparse L2 BLAS operations `_ Sparse BLAS level 2 routines for sparse matrix dense vector - + * `Sparse L3 BLAS operations `_ - Sparse BLAS level 3 routines for sparse matrix dense matrix + Sparse BLAS level 3 routines for sparse matrix dense matrix diff --git a/ROCm_API_References/rocBLAS.rst b/ROCm_API_References/rocBLAS.rst index 642579ca..daeea376 100644 --- a/ROCm_API_References/rocBLAS.rst +++ b/ROCm_API_References/rocBLAS.rst @@ -1,7 +1,7 @@ .. _rocBLAS: ============ -rocBLAS +rocBLAS ============ .. doxygenclass:: rocblas_handle @@ -13,6 +13,6 @@ rocBLAS :members: -.. doxygenfunction:: +.. doxygenfunction:: :project: rocBLAS :members: diff --git a/ROCm_Audio_Video_Tutorials/ROCm_videos.rst b/ROCm_Audio_Video_Tutorials/ROCm_videos.rst index c50099c0..8e41706f 100644 --- a/ROCm_Audio_Video_Tutorials/ROCm_videos.rst +++ b/ROCm_Audio_Video_Tutorials/ROCm_videos.rst @@ -1,7 +1,7 @@ -Slidecast: For AMD, It’s Time to ROCm! +Slidecast: For AMD, It's Time to ROCm! 
https://youtu.be/LUAu4eywK5g -Video: AMD ROC – Radeon Open Compute Platform +Video: AMD ROC - Radeon Open Compute Platform https://youtu.be/dnKDFci2x2Q diff --git a/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst b/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst index 000890c7..6c23240c 100644 --- a/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst +++ b/ROCm_Compiler_SDK/ROCm-Codeobj-format.rst @@ -113,27 +113,27 @@ The number of enabled registers must match value in compute_pgm_rsrc2.user_sgpr The following table defines SGPR registers that can be enabled and their order. ============ ============== ======================================= ================================================================== -SGPR Order Number +SGPR Order Number of Registers Name Description ============ ============== ======================================= ================================================================== First 4 Private Segment Buffer V# that can be used, together with Scratch Wave Offset as an - (enable_sgpr_private_segment_buffer) offset, to access the Private/Spill/Arg segments using a segment address. CP uses the value from - amd_queue_t.scratch_resource_descriptor. + (enable_sgpr_private_segment_buffer) offset, to access the Private/Spill/Arg segments using a segment address. CP uses the value from + amd_queue_t.scratch_resource_descriptor. then 2 Dispatch Ptr 64 bit address of AQL dispatch packet for kernel actually - (enable_sgpr_dispatch_ptr) executing. + (enable_sgpr_dispatch_ptr) executing. then 2 Queue Ptr 64 bit address of amd_queue_t object for AQL queue on which the (enable_sgpr_queue_ptr) dispatch packet was queued. - + then 2 Kernarg Segment Ptr 64 bit address of Kernarg segment. This is directly copied (enable_sgpr_kernarg_segment_ptr) from the kernarg_address in the kernel dispatch packet. Having CP load it once avoids loading it at the beginning of every wavefront. then 2 Dispatch Id 64 bit Dispatch ID of the dispatch packet being executed. - (enable_sgpr_dispatch_id) + (enable_sgpr_dispatch_id) then 2 Flat Scratch Init Value used for FLAT_SCRATCH register initialization. Refer to (enable_sgpr_flat_scratch_init) Flat scratch for more information. - + then 1 Private Segment Size The 32 bit byte size of a single work-items scratch memory (enable_sgpr_private_segment_size) allocation. This is the value from the kernel dispatch packet Private Segment Byte Size rounded up by CP to a multiple of WORD. Having CP load it once avoids loading it at the beginning of every wavefront. Not used for GFX7/GFX8 since it is the same value as the second SGPR of Flat Scratch Init. @@ -144,15 +144,15 @@ then 1 Grid Work-Group Count Y 32 bit count of the number of work-groups then 1 Grid Work-Group Count Z 32 bit count of the number of work-groups in the Z dimension (enable_sgpr_grid_workgroup_count_Z for the grid being executed. Computed from the fields in the && less than 16 previous SGPRs) kernel dispatch packet as ((grid_size.z + workgroup_size.z - 1) / workgroupSize.z). Only initialized if <16 previous SGPRs initialized. - + then 1 Work-Group Id X 32 bit work group id in X dimension of grid for wavefront. (enable_sgpr_workgroup_id_X) Always present. - + then 1 Work-Group Id Y 32 bit work group id in Y dimension of grid for wavefront. - (enable_sgpr_workgroup_id_Y) + (enable_sgpr_workgroup_id_Y) -then 1 Work-Group Id Z +then 1 Work-Group Id Z (enable_sgpr_workgroup_id_Z) 32 bit work group id in Z dimension of grid for wavefront. If present then Work-group Id Y will also be present. 
then 1 Work-Group Info {first_wave, 14b0000, ordered_append_term[10:0], @@ -160,7 +160,7 @@ then 1 Work-Group Info {first_wave, 14b0000, ordered_append_term[10:0], then 1 | Private Segment Wave Byte Offset 32 bit byte offset from base of scratch base of queue the | (enable_sgpr_private_segment_wave executing kernel dispatch. Must be used as an offset with | _byte_offset) Private/Spill/Arg segment address when using Scratch Segment Buffer. It must be added to Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. - + ============ ============== ======================================= ================================================================== VGPR register numbers used for enabled registers are dense starting at VGPR0: the first enabled register is VGPR0, the next enabled register is VGPR1 etc.; disabled registers do not have a VGPR number. @@ -423,7 +423,7 @@ Memory Fence scacq agent+ memfence; s_waitcnt 0; buffer_wbinvl1_vol Memory Fence screl agent+ s_waitcnt 0; memfence Memory Fence scar agent + memfence; s_waitcnt 0; buffer_wbinvl1_vol ============== ==================== ================= ======================================================== - + .. _Instruction-set-architecture: Instruction set architecture @@ -450,7 +450,7 @@ AMD AMDGPU 8 0 1 GFX8, XNACK enabled A10-8700 serie AMD AMDGPU 8 0 2 GFX8, SPI register limitation FirePro S7150, S7100, W7100; Radeon R285, R9 380, XNACK disabled, R9 385; Mobile FirePro M7170 - PCIe Gen3 atomics + PCIe Gen3 atomics AMD AMDGPU 8 0 3 GFX8, XNACK disabled, Radeon R9 Nano, R9 Fury, R9 FuryX, Pro Duo, RX 460, PCIe Gen3 atomics RX 470, RX 480; FirePro S9300x2 @@ -458,7 +458,7 @@ AMD AMDGPU 8 0 3 GFX8, XNACK disabled, Radeon R9 Nano, R9 Fury AMD AMDGPU 8 0 4 GFX8, -XNACK Legacy, Radeon R9 Nano, R9 Fury, R9 FuryX, Pro Duo, RX 460, RX 470, RX 480; FirePro S9300x2 -AMD AMDGPU 9 0 0 GFX9, -XNACK +AMD AMDGPU 9 0 0 GFX9, -XNACK AMD AMDGPU 9 0 1 GFX9, +XNACK ======= ============== ======= ======= ========== ============================== ===================================================== @@ -467,7 +467,7 @@ AMD AMDGPU 9 0 1 GFX9, +XNACK AMD Kernel Code ################### -AMD Kernel Code object is used by AMD GPU CP to set up the hardware to execute a kernel dispatch and consists of the meta data needed to initiate the execution of a kernel, including the entry point address of the machine code that implements +AMD Kernel Code object is used by AMD GPU CP to set up the hardware to execute a kernel dispatch and consists of the meta data needed to initiate the execution of a kernel, including the entry point address of the machine code that implements @@ -693,7 +693,7 @@ AMD_FLOAT_ROUND_MODE_ZERO 3 Round Toward 0 ====================================== ========= ===================================================================== .. 
_Denorm-Mode: - + Denorm Mode amd_float_denorm_mode_t ###################################### @@ -973,7 +973,7 @@ References * `AMD_Southern_Islands_Instruction_Set_Architecture `_ * `ROCR Runtime sources `_ * `amd_hsa_kernel_code.h `_ - * `amd_hsa_queue.h `_ + * `amd_hsa_queue.h `_ * `amd_hsa_signal.h `_ * `amd_hsa_common.h `_ * `PCI Express Atomic Operations `_ diff --git a/ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst b/ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst index 0ee50d42..52ac1e00 100644 --- a/ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst +++ b/ROCm_Compiler_SDK/ROCm-Compiler-SDK.rst @@ -61,7 +61,7 @@ Use the following commands: -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" \ ../llvm make - + To build the library bitcodes, clone the amd_stg_open branch of this repository. Run the following commands: @@ -189,7 +189,7 @@ Programmers should consult the HSA Runtime Programmer's Reference Manual for a f Known issues ************** - + * Each HSA process creates an internal DMA queue, but there is a system-wide limit of four DMA queues. When the limit is reached HSA processes will use internal kernels for copies. **Disclaimer** diff --git a/ROCm_Compiler_SDK/ROCm-Native-ISA.rst b/ROCm_Compiler_SDK/ROCm-Native-ISA.rst index e224d4d3..523bd611 100644 --- a/ROCm_Compiler_SDK/ROCm-Native-ISA.rst +++ b/ROCm_Compiler_SDK/ROCm-Native-ISA.rst @@ -90,7 +90,7 @@ GCN Native ISA LLVM Code Generator * :ref:`.amdgpu_metadata` * :ref:`Code Object V3 Example Source Code (-mattr=+code-object-v3)` * :ref:`Additional Documentation` - + .. _Introductio: @@ -130,7 +130,7 @@ Use the clang -target --- option to speci OS Description ============== ========================================================================================== Defaults to the unknown OS. -amdhsa Compute kernels executed on HSA [HSA] compatible runtimes such as AMD’s ROCm [AMD-ROCm]. +amdhsa Compute kernels executed on HSA [HSA] compatible runtimes such as AMD's ROCm [AMD-ROCm]. amdpal Graphic shaders and compute kernels executed on AMD PAL runtime. mesa3d Graphic shaders and compute kernels executed on Mesa 3D runtime. ============== ========================================================================================== @@ -343,10 +343,10 @@ Use the clang -mcpu option to specify the AMD GPU processor. The nam | | | | | cumode | | | | | | | | [off] | | | +-----------+-------------+--------------+-------+-----------------+---------+----------------------+ - -.. _Target Features: - + +.. _Target Features: + Target Features ----------------- @@ -362,32 +362,32 @@ For example: Enable the xnack feature. -mno-xnack Disable the xnack feature. - - **AMDGPU Target Features** + + **AMDGPU Target Features** ================= ============================================================================ Target Feature Description ================= ============================================================================ - -m[no-]xnack Enable/disable generating code that has memory clauses that are compatible + -m[no-]xnack Enable/disable generating code that has memory clauses that are compatible with having XNACK replay enabled. This is used for demand paging and page migration. If XNACK replay is - enabled in the device, then if a page fault occurs the code may execute + enabled in the device, then if a page fault occurs the code may execute incorrectly if the xnack feature is not enabled. 
Executing code that has the feature enabled on a device that does not have XNACK replay enabled will - execute correctly, but may be less performant than code with the feature + execute correctly, but may be less performant than code with the feature disabled. -m[no-]sram-ecc Enable/disable generating code that assumes SRAM ECC is enabled/disabled. -m[no-]wavefront size64 Control the default wavefront size used when generating code for kernels. - When disabled native wavefront size 32 is used, when enabled wavefront + When disabled native wavefront size 32 is used, when enabled wavefront size 64 is used. - -m[no-]cumode Control the default wavefront execution mode used when generating code + -m[no-]cumode Control the default wavefront execution mode used when generating code for kernels. When disabled native WGP wavefront execution mode is used, when enabled CU wavefront execution mode is used (see Memory Model). -================= ============================================================================ - - +================= ============================================================================ + + .. _Address-Spaces: @@ -402,9 +402,9 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR). **Address Space Mapping** -====================== =================== +====================== =================== LLVM Address Space Memory Space -====================== =================== +====================== =================== 0 Generic (Flat) 1 Global 2 Region (GDS) @@ -414,9 +414,9 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR). 6 Constant 32-bit 7 Buffer Fat Pointer (experimental) -====================== =================== +====================== =================== -The buffer fat pointer is an experimental address space that is currently unsupported in the backend. It exposes a non-integral pointer that is in future intended to support the modelling of 128-bit buffer descriptors + a 32-bit offset into the buffer descriptor (in total encapsulating a 160-bit ‘pointer’), allowing us to use normal LLVM load/store/atomic operations to model the buffer descriptors used heavily in graphics workloads targeting the backend. +The buffer fat pointer is an experimental address space that is currently unsupported in the backend. It exposes a non-integral pointer that is in future intended to support the modelling of 128-bit buffer descriptors + a 32-bit offset into the buffer descriptor (in total encapsulating a 160-bit 'pointer'), allowing us to use normal LLVM load/store/atomic operations to model the buffer descriptors used heavily in graphics workloads targeting the backend. .. _Memory-Scopes: @@ -429,33 +429,33 @@ The memory model supported is based on the HSA memory model which is based in t This is different to the OpenCL memory model which does not have scope inclusion and requires the memory scopes to exactly match. However, this is conservatively correct for OpenCL. 
- **AMDHSA LLVM Sync Scopes** -================ ================================================================================================================= + **AMDHSA LLVM Sync Scopes** +================ ================================================================================================================= LLVM Sync Scope Description -================ ================================================================================================================= +================ ================================================================================================================= none The default: system. - Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation’s sync scope is: + Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation's sync scope is: * system. * agent and executed by a thread on the same agent. * workgroup and executed by a thread in the same workgroup. * wavefront and executed by a thread in the same wavefront. -agent Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation’s sync scope is: +agent Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation's sync scope is: * system or agent and executed by a thread on the same agent. * workgroup and executed by a thread in the same workgroup. * wavefront and executed by a thread in the same wavefront. -workgroup Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation’s sync scope is: +workgroup Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation's sync scope is: * system, agent or workgroup and executed by a thread in the same workgroup. * wavefront and executed by a thread in the same wavefront. -wavefront Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation’s sync scope is: +wavefront Synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) for all address spaces (except private, or generic that accesses private) provided the other operation's sync scope is: * system, agent, workgroup or wavefront and executed by a thread in the same wavefront. singlethread Only synchronizes with, and participates in modification and seq_cst total orderings with, other operations (except image operations) running in the same thread for all address spaces (for example, in signal handlers). 
one-as Same as system but only synchronizes with other operations within the same address space -================ ================================================================================================================= +================ ================================================================================================================= .. _AMDGPU-Intrinsics: @@ -474,22 +474,22 @@ AMDGPU Attributes The AMDGPU backend supports the following LLVM IR attributes. - **AMDGPU LLVM IR Attributes** + **AMDGPU LLVM IR Attributes** ============================================ ============================================================================================= LLVM Attribute Description ============================================ ============================================================================================= -“amdgpu-flat-work-group-size”=”min,max” Specify the minimum and maximum flat work group sizes that will be specified - when the kernel is dispatched. Generated by the amdgpu_flat_work_group_size +"amdgpu-flat-work-group-size"="min,max" Specify the minimum and maximum flat work group sizes that will be specified + when the kernel is dispatched. Generated by the amdgpu_flat_work_group_size CLANG attribute. -“amdgpu-implicitarg-num-bytes”=”n” Number of kernel argument bytes to add to the kernel argument block size +"amdgpu-implicitarg-num-bytes"="n" Number of kernel argument bytes to add to the kernel argument block size for the implicit arguments. This varies by OS and language -“amdgpu-num-sgpr”=”n” Specifies the number of SGPRs to use. Generated by the amdgpu_num_sgpr CLANG attribute -“amdgpu-num-vgpr”=”n” Specifies the number of VGPRs to use. Generated by the amdgpu_num_vgpr CLANG attribute -“amdgpu-waves-per-eu”=”m,n” Specify the minimum and maximum number of waves per execution unit. +"amdgpu-num-sgpr"="n" Specifies the number of SGPRs to use. Generated by the amdgpu_num_sgpr CLANG attribute +"amdgpu-num-vgpr"="n" Specifies the number of VGPRs to use. Generated by the amdgpu_num_vgpr CLANG attribute +"amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per execution unit. Generated by the amdgpu_waves_per_eu CLANG attribute -“amdgpu-ieee” true/false. Specify whether the function expects the IEEE field of the mode register to +"amdgpu-ieee" true/false. Specify whether the function expects the IEEE field of the mode register to be set on entry. Overrides the default for the calling convention. -“amdgpu-dx10-clamp” true/false. Specify whether the function expects the DX10_CLAMP field of the mode +"amdgpu-dx10-clamp" true/false. Specify whether the function expects the DX10_CLAMP field of the mode register to be set on entry. Overrides the default for the calling convention. 
============================================ ============================================================================================= @@ -522,7 +522,7 @@ The AMDGPU backend uses the following ELF header: =========================== =================================== **AMDGPU ELF Header Enumeration Values** - + ========================== =============== Name Value ========================== =============== @@ -569,8 +569,8 @@ Sections An AMDGPU target ELF code object has the standard ELF sections which include: - **AMDGPU ELF Sections** - + **AMDGPU ELF Sections** + =============== ================ ==================================== Name Type Attributes =============== ================ ==================================== @@ -605,7 +605,7 @@ These sections have their standard meanings and are only generated if needed. .relaname, .rela.dyn For relocatable code objects, name is the name of the section that the relocation records apply. For example, .rela.text is the section name for relocation records associated with the .text section. - For linked shared code objects, .rela.dyn contains all the relocation records from each of the relocatable code object’s .relaname sections. + For linked shared code objects, .rela.dyn contains all the relocation records from each of the relocatable code object's .relaname sections. See Relocation Records for the relocation records supported by the AMDGPU backend. .text @@ -618,18 +618,18 @@ Note Records As required by ELFCLASS64, minimal zero byte padding must be generated after the name field to ensure the desc field is 4 byte aligned. In addition, minimal zero byte padding must be generated to ensure the desc field size is a multiple of 4 bytes. The sh_addralign field of the .note section must be at least 4 to indicate at least 8 byte alignment. -The AMDGPU backend code object uses the following ELF note records in the .note section. The Description column specifies the layout of the note record’s desc field. All fields are consecutive bytes. Note records with variable size strings have a corresponding *_size field that specifies the number of bytes, including the terminating null character, in the string. The string(s) come immediately after the preceding fields. +The AMDGPU backend code object uses the following ELF note records in the .note section. The Description column specifies the layout of the note record's desc field. All fields are consecutive bytes. Note records with variable size strings have a corresponding *_size field that specifies the number of bytes, including the terminating null character, in the string. The string(s) come immediately after the preceding fields. Additional note records can be present. **AMDGPU ELF Note Records** - -================ ============================== ========================================== + +================ ============================== ========================================== Name Type Description -================ ============================== ========================================== - “AMD” NT_AMD_AMDGPU_HSA_METADATA - “AMD” NT_AMD_AMDGPU_ISA -================ ============================== ========================================== +================ ============================== ========================================== + "AMD" NT_AMD_AMDGPU_HSA_METADATA + "AMD" NT_AMD_AMDGPU_ISA +================ ============================== ========================================== @@ -637,7 +637,7 @@ Additional note records can be present. 
**AMDGPU ELF Note Record Enumeration Values** ============================= ================== Name Value -============================= ================== +============================= ================== reserved 0-9 NT_AMD_AMDGPU_HSA_METADATA 10 NT_AMD_AMDGPU_ISA 11 @@ -658,7 +658,7 @@ NT_AMD_AMDGPU_ISA where: ``architecture`` - The architecture from table AMDGPU Target Triples. + The architecture from table AMDGPU Target Triples. This is always amdgcn when the target triple OS is amdhsa (see Target Triples). ``vendor`` @@ -667,22 +667,22 @@ NT_AMD_AMDGPU_ISA ``OS`` The OS from table AMDGPU Target Triples. - + ``environment`` An environment from table AMDGPU Target Triples, or blank if the environment has no affect on the execution of the code object. For the AMDGPU backend this is currently always blank. - + ``processor`` The processor from table AMDGPU Processors. For example:: - + amdgcn-amd-amdhsa--gfx901 ``NT_AMD_AMDGPU_HSA_METADATA`` - Specifies extensible metadata associated with the code objects executed on HSA [HSA] compatible runtimes such as AMD’s ROCm [AMD-ROCm]. It is required when the target triple OS is amdhsa (see Target Triples). See Code Object Metadata for the syntax of the code object metadata string. + Specifies extensible metadata associated with the code objects executed on HSA [HSA] compatible runtimes such as AMD's ROCm [AMD-ROCm]. It is required when the target triple OS is amdhsa (see Target Triples). See Code Object Metadata for the syntax of the code object metadata string. .. _Symbols: @@ -692,7 +692,7 @@ Symbols Symbols include the following: **AMDGPU ELF Symbols** - + +----------------+------------+-----------+--------------------+ | Name | Type | Section | Description | +================+============+===========+====================+ @@ -744,7 +744,7 @@ Following notations are used for specifying relocation calculations: **A** Represents the addend used to compute the value of the relocatable field. **G** - Represents the offset into the global offset table at which the relocation entry’s symbol will reside during execution. + Represents the offset into the global offset table at which the relocation entry's symbol will reside during execution. **GOT** Represents the address of the global offset table. **P** @@ -784,7 +784,7 @@ The following relocation types are supported: | R_AMDGPU_REL32_HI | 11 | word32 | (S + A - P) >> 32 | +------------------------+-------+--------+--------------------------------+ - + .. _DWARF: DWARF @@ -798,7 +798,7 @@ Address Space Mapping The following address space mapping is used: AMDGPU DWARF Address Space Mapping -======================== ======================== +======================== ======================== DWARF Address Space Memory Space ======================== ======================== 1 Private (Scratch) @@ -847,7 +847,7 @@ This section provides code conventions used when the target triple OS is amdhsa Code Object Metadata +++++++++++++++++++++ -The code object metadata specifies extensible metadata associated with the code objects executed on HSA [HSA] compatible runtimes such as AMD’s ROCm [AMD-ROCm]. It is specified by the NT_AMD_AMDGPU_HSA_METADATA note record (see Note Records) and is required when the target triple OS is amdhsa (see Target Triples). It must contain the minimum information necessary to support the ROCM kernel queries. For example, the segment sizes needed in a dispatch packet. In addition, a high level language runtime may require other information to be included. 
For example, the AMD OpenCL runtime records kernel argument information. +The code object metadata specifies extensible metadata associated with the code objects executed on HSA [HSA] compatible runtimes such as AMD's ROCm [AMD-ROCm]. It is specified by the NT_AMD_AMDGPU_HSA_METADATA note record (see Note Records) and is required when the target triple OS is amdhsa (see Target Triples). It must contain the minimum information necessary to support the ROCM kernel queries. For example, the segment sizes needed in a dispatch packet. In addition, a high level language runtime may require other information to be included. For example, the AMD OpenCL runtime records kernel argument information. The metadata is specified as a YAML formatted string (see [YAML] and YAML I/O). @@ -855,19 +855,19 @@ The metadata is represented as a single YAML document comprised of the mapping d For boolean values, the string values of false and true are used for false and true respectively. -Additional information can be added to the mappings. To avoid conflicts, any non-AMD key names should be prefixed by “vendor-name.”. +Additional information can be added to the mappings. To avoid conflicts, any non-AMD key names should be prefixed by "vendor-name.". AMDHSA Code Object Metadata Mapping +------------+------------------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+ | String Key | Value Type | Required? | Description | +============+========================+===========+================================================================================================================================================+ -| “Version” | sequence of 2 integers | Required | * The first integer is the major version. Currently 1. | +| "Version" | sequence of 2 integers | Required | * The first integer is the major version. Currently 1. | | | | | * The second integer is the minor version. Currently 0. | +------------+------------------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Printf” | sequence of strings | | Each string is encoded information about a printf function call. | +| "Printf" | sequence of strings | | Each string is encoded information about a printf function call. | | | | | The encoded information is organized as fields separated by colon | | | | | | -| | | | (‘:’):ID:N:S[0]:S[1]:...:S[N-1]:FormatString | +| | | | (':'):ID:N:S[0]:S[1]:...:S[N-1]:FormatString | | | | | | | | | | where: | | | | | ID | @@ -880,7 +880,7 @@ Additional information can be added to the mappings. To avoid conflicts, any non | | | | FormatString | | | | | The format string passed to the printf function call. | +------------+------------------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Kernels” | sequence of mapping | Required | Sequence of the mappings for each kernel in the code object. See AMDHSA Code Object Kernel Metadata Mapping for the definition of the mapping. | +| "Kernels" | sequence of mapping | Required | Sequence of the mappings for each kernel in the code object. See AMDHSA Code Object Kernel Metadata Mapping for the definition of the mapping. 
| +------------+------------------------+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -891,26 +891,26 @@ Additional information can be added to the mappings. To avoid conflicts, any non +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ | String Key | value Type | Required? | Description | +===================+========================+===========+====================================================================================================================================================+ -| “Name” | string | Required | Source name of the kernel. | +| "Name" | string | Required | Source name of the kernel. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “SymbolName” | string | Required | Name of the kernel descriptor ELF symbol. | +| "SymbolName" | string | Required | Name of the kernel descriptor ELF symbol. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Language” | string | | Source language of the kernel. Values include: | -| | | | * “OpenCL C” | -| | | | * “OpenCL C++” | -| | | | * “HCC” | -| | | | * “OpenMP” | +| "Language" | string | | Source language of the kernel. Values include: | +| | | | * "OpenCL C" | +| | | | * "OpenCL C++" | +| | | | * "HCC" | +| | | | * "OpenMP" | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “LanguageVersion” | sequence of 2 integers | | * The first integer is the major version. | +| "LanguageVersion" | sequence of 2 integers | | * The first integer is the major version. | | | | | * The second integer is the minor version. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Attrs” | mapping | | Mapping of kernel attributes. See AMDHSA Code Object Kernel Attribute Metadata Mapping for the mapping definition. | +| "Attrs" | mapping | | Mapping of kernel attributes. See AMDHSA Code Object Kernel Attribute Metadata Mapping for the mapping definition. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Arguments” | sequence of mapping | | Sequence of mappings of the kernel arguments. See AMDHSA Code Object Kernel Argument Metadata Mapping for the definition of the mapping. | +| "Arguments" | sequence of mapping | | Sequence of mappings of the kernel arguments. See AMDHSA Code Object Kernel Argument Metadata Mapping for the definition of the mapping. 
| +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “CodeProps” | mapping | | Mapping of properties related to the kernel code. See AMDHSA Code Object Kernel Code Properties Metadata Mapping for the mapping definition. | +| "CodeProps" | mapping | | Mapping of properties related to the kernel code. See AMDHSA Code Object Kernel Code Properties Metadata Mapping for the mapping definition. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ -| “DebugProps” | mapping | | Mapping of properties related to the kernel debugging. See AMDHSA Code Object Kernel Debug Properties Metadata Mapping for the mapping definition. | +| "DebugProps" | mapping | | Mapping of properties related to the kernel debugging. See AMDHSA Code Object Kernel Debug Properties Metadata Mapping for the mapping definition. | +-------------------+------------------------+-----------+----------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -922,152 +922,152 @@ Additional information can be added to the mappings. To avoid conflicts, any non +---------------------+------------------------+-----------+-----------------------------------------------------------------------------+ | String Key | Value Type | Required? | Description | +=====================+========================+===========+=============================================================================+ -| “ReqdWorkGroupSize” | sequence of 3 integers | | The dispatch work-group size X,Y,Z must correspond to the specified values. | +| "ReqdWorkGroupSize" | sequence of 3 integers | | The dispatch work-group size X,Y,Z must correspond to the specified values. | | | | | Corresponds to the OpenCL reqd_work_group_size attribute. | +---------------------+------------------------+-----------+-----------------------------------------------------------------------------+ -| “WorkGroupSizeHint” | sequence of 3 integers | | The dispatch work-group size X,Y,Z is likely to be the specified values. | +| "WorkGroupSizeHint" | sequence of 3 integers | | The dispatch work-group size X,Y,Z is likely to be the specified values. | | | | | Corresponds to the OpenCL work_group_size_hint attribute. | +---------------------+------------------------+-----------+-----------------------------------------------------------------------------+ -| “VecTypeHint” | string | | The name of a scalar or vector type. | +| "VecTypeHint" | string | | The name of a scalar or vector type. | | | | | Corresponds to the OpenCL vec_type_hint attribute. | +---------------------+------------------------+-----------+-----------------------------------------------------------------------------+ - - + + **AMDHSA Code Object Kernel Argument Metadata Mapping** - - + + +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | String Key | Value Type | Required? 
| Description | +=================+============+===========+===================================================================================================================================================================================================================================================================================================================================================+ -| “Name” | string | | Kernel argument name. | +| "Name" | string | | Kernel argument name. | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “TypeName” | string | | Kernel argument type name. | +| "TypeName" | string | | Kernel argument type name. | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Size” | integer | Required | Kernel argument size in bytes. | +| "Size" | integer | Required | Kernel argument size in bytes. | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “Align” | integer | Required | Kernel argument alignment in bytes. Must be a power of two. | +| "Align" | integer | Required | Kernel argument alignment in bytes. Must be a power of two. | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “ValueKind” | string | Required | Kernel argument kind that specifies how to set up the corresponding argument. Values include : | -| | | | “ByValue” | +| "ValueKind" | string | Required | Kernel argument kind that specifies how to set up the corresponding argument. Values include : | +| | | | "ByValue" | | | | | The argument is copied directly into the kernarg. | -| | | | “GlobalBuffer” | +| | | | "GlobalBuffer" | | | | | A global address space pointer to the buffer data is passed in the kernarg. | -| | | | “DynamicSharedPointer” | +| | | | "DynamicSharedPointer" | | | | | A group address space pointer to dynamically allocated LDS is passed in the kernarg. | -| | | | “Sampler” | +| | | | "Sampler" | | | | | A global address space pointer to a S# is passed in the kernarg. | -| | | | “Image” | +| | | | "Image" | | | | | A global address space pointer to a T# is passed in the kernarg. | -| | | | “Pipe” | +| | | | "Pipe" | | | | | A global address space pointer to an OpenCL pipe is passed in the kernarg. 
| -| | | | “Queue” | +| | | | "Queue" | | | | | A global address space pointer to an OpenCL device enqueue queue is passed in the kernarg. | -| | | | “HiddenGlobalOffsetX” | +| | | | "HiddenGlobalOffsetX" | | | | | The OpenCL grid dispatch global offset for the X dimension is passed in the kernarg. | -| | | | “HiddenGlobalOffsetY” | +| | | | "HiddenGlobalOffsetY" | | | | | The OpenCL grid dispatch global offset for the Y dimension is passed in the kernarg. | -| | | | “HiddenGlobalOffsetZ” | +| | | | "HiddenGlobalOffsetZ" | | | | | The OpenCL grid dispatch global offset for the Z dimension is passed in the kernarg. | -| | | | “HiddenNone” | +| | | | "HiddenNone" | | | | | An argument that is not used by the kernel. Space needs to be left for it, but it does not need to be set up. | -| | | | “HiddenPrintfBuffer” | +| | | | "HiddenPrintfBuffer" | | | | | A global address space pointer to the runtime printf buffer is passed in kernarg. | -| | | | “HiddenDefaultQueue” | +| | | | "HiddenDefaultQueue" | | | | | A global address space pointer to the OpenCL device enqueue queue that should be used by the kernel by default is passed in the kernarg. | -| | | | “HiddenCompletionAction” | +| | | | "HiddenCompletionAction" | | | | | TBD | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “ValueType” | Value Type | Required | Kernel argument value type. Only present if “ValueKind” is “ByValue”. For vector data types, the value is for the element type.Values include: | -| | | | * “Struct” | -| | | | * “I8” | -| | | | * “U8” | -| | | | * “I16” | -| | | | * “U16” | -| | | | * “F16” | -| | | | * “I32” | -| | | | * “U32” | -| | | | * “F32” | -| | | | * “I64” | -| | | | * “U64” | -| | | | * “F64” | +| "ValueType" | Value Type | Required | Kernel argument value type. Only present if "ValueKind" is "ByValue". For vector data types, the value is for the element type.Values include: | +| | | | * "Struct" | +| | | | * "I8" | +| | | | * "U8" | +| | | | * "I16" | +| | | | * "U16" | +| | | | * "F16" | +| | | | * "I32" | +| | | | * "U32" | +| | | | * "F32" | +| | | | * "I64" | +| | | | * "U64" | +| | | | * "F64" | ++-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| "PointeeAlign" | integer | | Alignment in bytes of pointee type for pointer type kernel argument. Must be a power of 2. Only present if "ValueKind" is "DynamicSharedPointer". | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “PointeeAlign” | integer | | Alignment in bytes of pointee type for pointer type kernel argument. Must be a power of 2. 
Only present if “ValueKind” is “DynamicSharedPointer”. | +| "AddrSpaceQual" | string | | Kernel argument address space qualifier. Only present if "ValueKind" is "GlobalBuffer" or "DynamicSharedPointer".Values are : | +| | | | * "Private" | +| | | | * "Global" | +| | | | * "Constant" | +| | | | * "Local" | +| | | | * "Generic" | +| | | | * "Region" | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “AddrSpaceQual” | string | | Kernel argument address space qualifier. Only present if “ValueKind” is “GlobalBuffer” or “DynamicSharedPointer”.Values are : | -| | | | * “Private” | -| | | | * “Global” | -| | | | * “Constant” | -| | | | * “Local” | -| | | | * “Generic” | -| | | | * “Region” | +| "AccQual" | string | | Kernel argument access qualifier. Only present if "ValueKind" is "Image" or "Pipe". Values are : | +| | | | * "ReadOnly" | +| | | | * "WriteOnly" | +| | | | * "ReadWrite" | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “AccQual” | string | | Kernel argument access qualifier. Only present if “ValueKind” is “Image” or “Pipe”. Values are : | -| | | | * “ReadOnly” | -| | | | * “WriteOnly” | -| | | | * “ReadWrite” | +| "ActualAcc" | string | | The actual memory accesses performed by the kernel on the kernel argument.Only present if "ValueKind" is "GlobalBuffer", "Image", or "Pipe". This may be more restrictive than indicated by "AccQual" to reflect what the kernel actual does.If not present then the runtime must assume what is implied by "AccQual" and "IsConst". Values are : | +| | | | * "ReadOnly" | +| | | | * "WriteOnly" | +| | | | * "ReadWrite" | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “ActualAcc” | string | | The actual memory accesses performed by the kernel on the kernel argument.Only present if “ValueKind” is “GlobalBuffer”, “Image”, or “Pipe”. This may be more restrictive than indicated by “AccQual” to reflect what the kernel actual does.If not present then the runtime must assume what is implied by “AccQual” and “IsConst”. Values are : | -| | | | * “ReadOnly” | -| | | | * “WriteOnly” | -| | | | * “ReadWrite” | +| "IsConst" | boolean | | Indicates if the kernel argument is const qualified. Only present if "ValueKind" is "GlobalBuffer". 
| +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “IsConst” | boolean | | Indicates if the kernel argument is const qualified. Only present if “ValueKind” is “GlobalBuffer”. | +| "IsRestrict" | boolean | | Indicates if the kernel argument is restrict qualified. Only present if "ValueKind" is "GlobalBuffer". | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “IsRestrict” | boolean | | Indicates if the kernel argument is restrict qualified. Only present if “ValueKind” is “GlobalBuffer”. | +| "IsVolatile" | boolean | | Indicates if the kernel argument is volatile qualified. Only present if "ValueKind" is "GlobalBuffer". | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “IsVolatile” | boolean | | Indicates if the kernel argument is volatile qualified. Only present if “ValueKind” is “GlobalBuffer”. | +| "IsPipe" | boolean | | Indicates if the kernel argument is pipe qualified. Only present if "ValueKind" is "Pipe". | +-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| “IsPipe” | boolean | | Indicates if the kernel argument is pipe qualified. Only present if “ValueKind” is “Pipe”. | -+-----------------+------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - + **AMDHSA Code Object Kernel Code Properties Metadata Mapping** - - + + +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ | String Key | Value Type | Required? | Description | +===========================+============+===========+==========================================================================================================================+ -| “KernargSegmentSize” | integer | Required | The size in bytes of the kernarg segment that holds the values of the arguments to the kernel. 
| +| "KernargSegmentSize" | integer | Required | The size in bytes of the kernarg segment that holds the values of the arguments to the kernel. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “GroupSegmentFixedSize” | integer | Required | The amount of group segment memory required by a work-group in bytes. | +| "GroupSegmentFixedSize" | integer | Required | The amount of group segment memory required by a work-group in bytes. | | | | | This does not include any dynamically allocated group segment memory that may be added when the kernel is dispatched. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “PrivateSegmentFixedSize” | integer | Required | The amount of fixed private address space memory required for a work-item in bytes. | +| "PrivateSegmentFixedSize" | integer | Required | The amount of fixed private address space memory required for a work-item in bytes. | | | | | If IsDynamicCallstack is 1 then additional space must be added to this value for the call stack. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “KernargSegmentAlign” | integer | Required | The maximum byte alignment of arguments in the kernarg segment. Must be a power of 2. | +| "KernargSegmentAlign" | integer | Required | The maximum byte alignment of arguments in the kernarg segment. Must be a power of 2. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “WavefrontSize” | integer | Required | Wavefront size. Must be a power of 2. | +| "WavefrontSize" | integer | Required | Wavefront size. Must be a power of 2. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “NumSGPRs” | integer | | Number of scalar registers used by a wavefront for GFX6-GFX9. | +| "NumSGPRs" | integer | | Number of scalar registers used by a wavefront for GFX6-GFX9. | | | | | This includes the special SGPRs for VCC, Flat Scratch (GFX7-GFX9) and XNACK (for GFX8-GFX9). | | | | | It does not include the 16 SGPR added if a trap handler is enabled. It is not rounded up to the allocation granularity. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “NumVGPRs” | integer | | Number of vector registers used by each work-item for GFX6-GFX9 | +| "NumVGPRs" | integer | | Number of vector registers used by each work-item for GFX6-GFX9 | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “MaxFlatWorkgroupSize” | integer | | Maximum flat work-group size supported by the kernel in work-items. | +| "MaxFlatWorkgroupSize" | integer | | Maximum flat work-group size supported by the kernel in work-items. 
| +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “IsDynamicCallStack” | boolean | | Indicates if the generated machine code is using a dynamically sized call stack. | +| "IsDynamicCallStack" | boolean | | Indicates if the generated machine code is using a dynamically sized call stack. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ -| “IsXNACKEnabled” | boolean | | Indicates if the generated machine code is capable of supporting XNACK. | +| "IsXNACKEnabled" | boolean | | Indicates if the generated machine code is capable of supporting XNACK. | +---------------------------+------------+-----------+--------------------------------------------------------------------------------------------------------------------------+ - + **AMDHSA Code Object Kernel Debug Properties Metadata Mapping** - + +-------------------------------------+------------+-----------+-------------+ | String Key | Value Type | Required? | Description | +=====================================+============+===========+=============+ -| “DebuggerABIVersion” | string | | | +| "DebuggerABIVersion" | string | | | ++-------------------------------------+------------+-----------+-------------+ +| "ReservedNumVGPRs" | integer | | | +-------------------------------------+------------+-----------+-------------+ -| “ReservedNumVGPRs” | integer | | | +| "ReservedFirstVGPR" | integer | | | +-------------------------------------+------------+-----------+-------------+ -| “ReservedFirstVGPR” | integer | | | +| "PrivateSegmentBufferSGPR" | integer | | | +-------------------------------------+------------+-----------+-------------+ -| “PrivateSegmentBufferSGPR” | integer | | | +| "WavefrontPrivateSegmentOffsetSGPR" | integer | | | +-------------------------------------+------------+-----------+-------------+ -| “WavefrontPrivateSegmentOffsetSGPR” | integer | | | -+-------------------------------------+------------+-----------+-------------+ .. _Kernel Dispatch: @@ -1085,7 +1085,7 @@ To dispatch a kernel the following actions are performed. This can occur in the 1. A pointer to an AQL queue for the kernel agent on which the kernel is to be executed is obtained. 2. A pointer to the kernel descriptor (see Kernel Descriptor) of the kernel to execute is obtained. It must be for a kernel that is contained in a code object that that was loaded by the ROCm runtime on the kernel agent with which the AQL queue is associated. 3. Space is allocated for the kernel arguments using the ROCm runtime allocator for a memory region with the kernarg property for the kernel agent that will execute the kernel. It must be at least 16 byte aligned. - 4. Kernel argument values are assigned to the kernel argument memory allocation. The layout is defined in the HSA Programmer’s Language Reference [HSA]. For AMDGPU the kernel execution directly accesses the kernel argument memory in the same way constant memory is accessed. (Note that the HSA specification allows an implementation to copy the kernel argument contents to another location that is accessed by the kernel.) + 4. Kernel argument values are assigned to the kernel argument memory allocation. The layout is defined in the HSA Programmer's Language Reference [HSA]. 
For AMDGPU the kernel execution directly accesses the kernel argument memory in the same way constant memory is accessed. (Note that the HSA specification allows an implementation to copy the kernel argument contents to another location that is accessed by the kernel.) 5. An AQL kernel dispatch packet is created on the AQL queue. The ROCm runtime api uses 64 bit atomic operations to reserve space in the AQL queue for the packet. The packet must be set up, and the final write must use an atomic store release to set the packet kind to ensure the packet contents are visible to the kernel agent. AQL defines a doorbell signal mechanism to notify the kernel agent that the AQL queue has been updated. These rules, and the layout of the AQL queue and kernel dispatch packet is defined in the HSA System Architecture Specification [HSA]. 6. A kernel dispatch packet includes information about the actual dispatch, such as grid and work-group size, together with information from the code object about the kernel, such as segment sizes. The ROCm runtime queries on the kernel symbol can be used to obtain the code object values which are recorded in the Code Object Metadata. 7. CP executes micro-code and is responsible for detecting and setting up the GPU to execute the wavefronts of a kernel dispatch. @@ -1170,7 +1170,7 @@ Kernel Descriptor for GFX6-GFX9 CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. Kernel Descriptor for GFX6-GFX9 - + +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Bits | Size | Field Name | Description | +=========+==========================+=====================================+================================================================================================================================================================================================================+ @@ -1186,7 +1186,7 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 127:98 | 30 bits | | Reserved. Must be 0. | +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 191:128 | 8 bytes | kernel_code_entry_byte_offset | Byte offset (possibly negative) from base address of kernel descriptor to kernel’s entry point instruction which must be 256 byte aligned. | +| 191:128 | 8 bytes | kernel_code_entry_byte_offset | Byte offset (possibly negative) from base address of kernel descriptor to kernel's entry point instruction which must be 256 byte aligned. | +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 383:192 | 24 bytes | | Reserved. Must be 0. 
| +---------+--------------------------+-------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -1225,7 +1225,7 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. **compute_pgm_rsrc1 for GFX6-GFX9** - + +-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Bits | Size | Field Name | Description | +=======+========================+=================================+======================================================================================================================================================================================================================================================================================+ @@ -1267,7 +1267,7 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. | | | | CP is responsible for filling in ``COMPUTE_PGM_RSRC1.PRIV.`` | +-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 21 | 1 bit | enable_dx10_clamp | Wavefront starts execution with DX10 clamp mode enabled. | -| | | | Used by the vector ALU to force DX-10 style treatment of NaN’s (when set, clamp NaN to zero, otherwise pass NaN through). | +| | | | Used by the vector ALU to force DX-10 style treatment of NaN's (when set, clamp NaN to zero, otherwise pass NaN through). | | | | | Used by CP to set up`` COMPUTE_PGM_RSRC1.DX10_CLAMP.`` | +-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 22 | 1 bit | debug_mode | Must be 0. | @@ -1289,10 +1289,10 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. 
+-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 32 | **Total size 4 bytes** | | | +-------+------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - + **compute_pgm_rsrc2 for GFX6-GFX9** - + +-------+---------------------+-------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Bits | Size | Field Name | Description | +=======+=====================+=================================================+===============================================================================================================================================================================================+ @@ -1362,10 +1362,10 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. +-------+---------------------+-------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | 32 | Total size 4 bytes. | | | +-------+---------------------+-------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - - - Floating Point Rounding Mode Enumeration Values - + + + Floating Point Rounding Mode Enumeration Values + +-------------------------------------+-------+------------------------+ | Enumeration Name | Value | Description | +=====================================+=======+========================+ @@ -1378,7 +1378,7 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. | AMD_FLOAT_ROUND_MODE_ZERO | 3 | Round Toward 0 | +-------------------------------------+-------+------------------------+ - Floating Point Denorm Mode + Floating Point Denorm Mode +-------------------------------------+-------+--------------------------------------+ | Enumeration Values Enumeration Name | Value | Description | +=====================================+=======+======================================+ @@ -1392,8 +1392,8 @@ CP microcode requires the Kernel descritor to be allocated on 64 byte alignment. +-------------------------------------+-------+--------------------------------------+ - System VGPR Work-Item ID - + System VGPR Work-Item ID + +---------------------------------------+-------+-----------------------------------------+ | Enumeration Values Enumeration Name | Value | Description | +=======================================+=======+=========================================+ @@ -1447,13 +1447,13 @@ SGPR register initial state is defined in SGPR Register Set Up Order. 
| | | | FLAT_SCRATCH_HI corresponds to SGPRn-4 on GFX7, and SGPRn-6 on GFX8 (where SGPRn is the highest numbered SGPR allocated to the wave). | | | | | FLAT_SCRATCH_HI is multiplied by 256 (as it is in units of 256 bytes) and added to SH_HIDDEN_PRIVATE_BASE_VIMID to calculate the per wave FLAT SCRATCH BASE in flat memory instructions that access the scratch apperture. | | | | | | -| | | | The second SGPR is 32 bit byte size of a single work-item’s scratch memory usage. | -| | | | CP obtains this from the runtime, and it is always a multiple of DWORD. CP checks that the value in the kernel dispatch packet Private Segment Byte Size is not larger, and requests the runtime to increase the queue’s scratch size if necessary. | +| | | | The second SGPR is 32 bit byte size of a single work-item's scratch memory usage. | +| | | | CP obtains this from the runtime, and it is always a multiple of DWORD. CP checks that the value in the kernel dispatch packet Private Segment Byte Size is not larger, and requests the runtime to increase the queue's scratch size if necessary. | | | | | The kernel code must move it to FLAT_SCRATCH_LO which is SGPRn-3 on GFX7 and SGPRn-5 on GFX8. FLAT_SCRATCH_LO is used as the FLAT SCRATCH SIZE in flat memory instructions. | | | | | Having CP load it once avoids loading it at the beginning of every wavefront. GFX9 This is the 64 bit base address of the per SPI scratch backing memory managed by SPI for the queue executing the kernel dispatch. CP obtains this from the runtime | | | | | (and divides it if there are multiple Shader Arrays each with its own SPI). | | | | | The value of Scratch Wave Offset must be added by the kernel machine code and the result moved to the FLAT_SCRATCH SGPR which is SGPRn-6 and SGPRn-5. | -| | | | It is used as the FLAT SCRATCH BASE in flat memory instructions. then Private Segment Size 1 The 32 bit byte size of a (enable_sgpr_private single work-item’s scratch_segment_size) memory allocation. | +| | | | It is used as the FLAT SCRATCH BASE in flat memory instructions. then Private Segment Size 1 The 32 bit byte size of a (enable_sgpr_private single work-item's scratch_segment_size) memory allocation. | | | | | This is the value from the kernel dispatch packet Private Segment Byte Size rounded up by CP to a multiple of DWORD. | | | | | Having CP load it once avoids loading it at the beginning of every wavefront. | | | | | | @@ -1477,7 +1477,7 @@ SGPR register initial state is defined in SGPR Register Set Up Order. +------------+----------------------------------------------------------------------------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | then | Work-Group Id Z (enable_sgpr_workgroup_id _Z) | 1 | 32 bit work-group id in Z dimension of grid for wavefront. 
| +------------+----------------------------------------------------------------------------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| then | Work-Group Info (enable_sgpr_workgroup _info) | 1 | {first_wave, 14’b0000, ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} | +| then | Work-Group Info (enable_sgpr_workgroup _info) | 1 | {first_wave, 14'b0000, ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} | +------------+----------------------------------------------------------------------------------------------+-----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | then | Scratch Wave Offset (enable_sgpr_private _segment_wave_offset) | 1 | 32 bit byte offset from base of scratch base of queue executing the kernel dispatch. | | | | | Must be used as an offset with Private segment address when using Scratch Segment Buffer. | @@ -1489,7 +1489,7 @@ The order of the VGPR registers is defined, but the compiler can specify which o VGPR register initial state is defined in VGPR Register Set Up Order. VGPR Register Set Up Order - + +------------+----------------------------------------------+-----------------+----------------------------------------------------------------------+ | VGPR Order | Name (kernel descriptor enable field) | Number of VGPRs | Description | +============+==============================================+=================+======================================================================+ @@ -1542,9 +1542,9 @@ If the kernel may use flat operations to access scratch memory, the prolog code GFX6 Flat scratch is not supported. - + GFX7-8 - 1. The low word of Flat Scratch Init is 32 bit byte offset from SH_HIDDEN_PRIVATE_BASE_VIMID to the base of scratch backing memory being managed by SPI for the queue executing the kernel dispatch. This is the same value used in the Scratch Segment Buffer V# base address. The prolog must add the value of Scratch Wave Offset to get the wave’s byte scratch backing memory offset from SH_HIDDEN_PRIVATE_BASE_VIMID. Since FLAT_SCRATCH_LO is in units of 256 bytes, the offset must be right shifted by 8 before moving into FLAT_SCRATCH_LO. + 1. The low word of Flat Scratch Init is 32 bit byte offset from SH_HIDDEN_PRIVATE_BASE_VIMID to the base of scratch backing memory being managed by SPI for the queue executing the kernel dispatch. This is the same value used in the Scratch Segment Buffer V# base address. The prolog must add the value of Scratch Wave Offset to get the wave's byte scratch backing memory offset from SH_HIDDEN_PRIVATE_BASE_VIMID. Since FLAT_SCRATCH_LO is in units of 256 bytes, the offset must be right shifted by 8 before moving into FLAT_SCRATCH_LO. 2. The second word of Flat Scratch Init is 32 bit byte size of a single work-items scratch memory usage. This is directly loaded from the kernel dispatch packet Private Segment Byte Size and rounded up to a multiple of DWORD. Having CP load it once avoids loading it at the beginning of every wavefront. The prolog must move it to FLAT_SCRATCH_LO for use as FLAT SCRATCH SIZE. 
GFX9 @@ -1953,7 +1953,7 @@ On dGPU the kernarg backing memory is accessed as UC (uncached) to avoid needing The memory order also adds the single thread optimization constrains defined in table AMDHSA Memory Model Single Thread Optimization Constraints GFX6-GFX9. AMDHSA Memory Model Single Thread Optimization Constraints GFX6-GFX9 - + +-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | LLVM Memory | Optimization Constraints | +=============+============================================================================================================================================================================================+ @@ -2000,8 +2000,8 @@ For code objects generated by AMDGPU backend for HSA [HSA] compatible runtimes ( | | | queue_ptr | | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | llvm.debugtrap | s_trap 0x03 | | If debugger not installed then behaves as a no-operation. The trap handler is entered and immediately returns to continue execution of the wavefront. | -| | | | If the debugger is installed, causes the debug trap to be reported by the debugger and the wavefront is put in the halt state until resumed by debugger. | -+---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ +| | | | If the debugger is installed, causes the debug trap to be reported by the debugger and the wavefront is put in the halt state until resumed by debugger. 
| ++---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | reserved | s_trap 0x04 | | Reserved | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | reserved | s_trap 0x05 | | Reserved | @@ -2012,7 +2012,7 @@ For code objects generated by AMDGPU backend for HSA [HSA] compatible runtimes ( +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | reserved | s_trap 0x08 | | Reserved | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ -| reserved | s_trap 0xfe | | Reserved | +| reserved | s_trap 0xfe | | Reserved | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ | reserved | s_trap 0xff | | Reserved | +---------------------+---------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -2041,17 +2041,17 @@ Note that there are always 10 available user data entries in registers - entries **PAL Compute Shader User Data Registers** - ================ ===================================================== - User Register Description - ================ ===================================================== + ================ ===================================================== + User Register Description + ================ ===================================================== 0 Global Internal Table (32-bit pointer) 1 Per-Shader Internal Table (32-bit pointer) 2 - 11 Application-Controlled User Data (10 32-bit values) 12 Spill Table (32-bit pointer) 13 - 14 Thread Group Count (64-bit pointer) - 15 GDS Range - ================ ===================================================== - + 15 GDS Range + ================ ===================================================== + .. _Graphics-User-Data: Graphics User Data @@ -2108,7 +2108,7 @@ The following table illustrates the required format: 52 vaRange::ShadowDescriptorTable High Bits ========= ============================================== -The pointer to the global internal table passed to the shader as user data is a 32-bit pointer. The top 32 bits should be assumed to be the same as the top 32 bits of the pipeline, so the shader may use the program counter’s top 32 bits. +The pointer to the global internal table passed to the shader as user data is a 32-bit pointer. The top 32 bits should be assumed to be the same as the top 32 bits of the pipeline, so the shader may use the program counter's top 32 bits. .. _Unspecified OS: @@ -2218,7 +2218,7 @@ The following syntax for register operands is supported: * Register pairs, quads, etc: s[2:3], v[10:11], ttmp[5:6], s[4:7], v[12:15], ttmp[4:7], s[8:15], ... 
* Register lists: [s0, s1], [ttmp0, ttmp1, ttmp2, ttmp3] * Register index expressions: v[2*2], s[1-1:2-1] - * ‘off’ indicates that an operand is not enabled + * 'off' indicates that an operand is not enabled The following extra operands are supported: @@ -2258,29 +2258,29 @@ DS *** :: - + ds_add_u32 v2, v4 offset:16 ds_write_src2_b64 v2 offset0:4 offset1:8 - ds_cmpst_f32 v2, v4, v6 + ds_cmpst_f32 v2, v4, v6 ds_min_rtn_f64 v[8:9], v2, v[4:5] - -For full list of supported instructions, refer to “LDS/GDS instructions” in ISA Manual. + +For full list of supported instructions, refer to "LDS/GDS instructions" in ISA Manual. .. _FLAT: FLAT ***** :: - + flat_load_dword v1, v[3:4] flat_store_dwordx3 v[3:4], v[5:7] flat_atomic_swap v1, v[3:4], v5 glc flat_atomic_cmpswap v1, v[3:4], v[5:6] glc slc flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc - -For full list of supported instructions, refer to “FLAT instructions” in ISA Manual. + +For full list of supported instructions, refer to "FLAT instructions" in ISA Manual. .. _MUBUF: @@ -2288,35 +2288,35 @@ For full list of supported instructions, refer to “FLAT instructions” in ISA MUBUF ****** :: - + buffer_load_dword v1, off, s[4:7], s1 buffer_store_dwordx4 v[1:4], v2, ttmp[4:7], s1 offen offset:4 glc tfe buffer_store_format_xy v[1:2], off, s[4:7], s1 buffer_wbinvl1 buffer_atomic_inc v1, v2, s[8:11], s4 idxen offset:4 slc -For full list of supported instructions, refer to “MUBUF Instructions” in ISA Manual. +For full list of supported instructions, refer to "MUBUF Instructions" in ISA Manual. .. _SMRD/SMEM: SMRD/SMEM ********** :: - + s_load_dword s1, s[2:3], 0xfc s_load_dwordx8 s[8:15], s[2:3], s4 s_load_dwordx16 s[88:103], s[2:3], s4 s_dcache_inv_vol s_memtime s[4:5] -For full list of supported instructions, refer to “Scalar Memory Operations” in ISA Manual. +For full list of supported instructions, refer to "Scalar Memory Operations" in ISA Manual. .. _SOP1: SOP1 ***** :: - + s_mov_b32 s1, s2 s_mov_b64 s[0:1], 0x80000000 s_cmov_b32 s1, 200 @@ -2325,14 +2325,14 @@ SOP1 s_swappc_b64 s[2:3], s[4:5] s_cbranch_join s[4:5] -For full list of supported instructions, refer to “SOP1 Instructions” in ISA Manual. +For full list of supported instructions, refer to "SOP1 Instructions" in ISA Manual. .. _SOP2: SOP2 ***** :: - + s_add_u32 s1, s2, s3 s_and_b64 s[2:3], s[4:5], s[6:7] s_cselect_b32 s1, s2, s3 @@ -2342,28 +2342,28 @@ SOP2 s_bfm_b64 s[2:3], s4, s6 s_bfe_i64 s[2:3], s[4:5], s6 s_cbranch_g_fork s[4:5], s[6:7] - -For full list of supported instructions, refer to “SOP2 Instructions” in ISA Manual. + +For full list of supported instructions, refer to "SOP2 Instructions" in ISA Manual. .. _SOPC: SOPC ***** :: - + s_cmp_eq_i32 s1, s2 s_bitcmp1_b32 s1, s2 s_bitcmp0_b64 s[2:3], s4 s_setvskip s3, s5 - -For full list of supported instructions, refer to “SOPC Instructions” in ISA Manual. + +For full list of supported instructions, refer to "SOPC Instructions" in ISA Manual. .. _SOPP: SOPP ***** :: - + s_barrier s_nop 2 s_endpgm @@ -2375,8 +2375,8 @@ SOPP s_sendmsg 0x1 s_sendmsg sendmsg(MSG_INTERRUPT) s_trap 1 - -For full list of supported instructions, refer to “SOPP Instructions” in ISA Manual. + +For full list of supported instructions, refer to "SOPP Instructions" in ISA Manual. Unless otherwise mentioned, little verification is performed on the operands of SOPP Instructions, so it is up to the programmer to be familiar with the range or acceptable values. 
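Because SOPP operands receive so little checking, it can help to see the raw-immediate and symbolic-helper operand forms side by side. The lines below are an illustrative sketch only, not part of the patched file; they assume the ``vmcnt``/``lgkmcnt`` helpers and ``sendmsg(...)`` helper accepted by the assembler, and use ``;`` as the comment character as in compiler-emitted GCN assembly::

   s_waitcnt 0                          ; raw immediate: wait for every counter to drain
   s_waitcnt vmcnt(0)                   ; symbolic helper for the vector memory counter
   s_waitcnt vmcnt(0) lgkmcnt(0)        ; helpers may be combined in a single operand
   s_sendmsg sendmsg(MSG_INTERRUPT)     ; message helper instead of the raw encoding 0x1

In either form the assembler does not verify that the requested values are meaningful for the selected target, so they remain the programmer's responsibility.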
@@ -2393,7 +2393,7 @@ For vector ALU instruction opcodes (VOP1, VOP2, VOP3, VOPC, VOP_DPP, VOP_SDWA), VOP1/VOP2/VOP3/VOPC examples ***************************** - + :: v_mov_b32 v1, v2 @@ -2411,9 +2411,9 @@ VOP1/VOP2/VOP3/VOPC examples VOP_DPP examples ****************** - + :: - + v_mov_b32 v0, v0 quad_perm:[0,2,1,1] v_sin_f32 v0, v0 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 v_mov_b32 v0, v0 wave_shl:1 @@ -2427,14 +2427,14 @@ VOP_SDWA examples ****************** :: - + v_mov_b32 v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD v_min_u32 v200, v200, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD v_sin_f32 v0, v0 dst_unused:UNUSED_PAD src0_sel:WORD_1 v_fract_f32 v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 v_cmpx_le_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 - -For full list of supported instructions, refer to “Vector ALU instructions”. + +For full list of supported instructions, refer to "Vector ALU instructions". .. _Code Object V2 Predefined Symbols (-mattr=-code-object-v3): @@ -2457,7 +2457,7 @@ The AMDGPU assembler defines and updates some symbols automatically. These symbo .option.machine_version_major ++++++++++++++++++++++++++++++ -Set to the GFX major generation number of the target being assembled for. For example, when assembling for a “GFX9” target this will be set to the integer value “9”. The possible GFX major generation numbers are presented in :ref:`Processors`. +Set to the GFX major generation number of the target being assembled for. For example, when assembling for a "GFX9" target this will be set to the integer value "9". The possible GFX major generation numbers are presented in :ref:`Processors`. .. _.option.machine_version_minor: @@ -2465,10 +2465,10 @@ Set to the GFX major generation number of the target being assembled for. For ex .option.machine_version_minor ++++++++++++++++++++++++++++++ -Set to the GFX minor generation number of the target being assembled for. For example, when assembling for a “GFX810” target this will be set to the integer value “1”. The possible GFX minor generation numbers are presented in :ref:`Processors`. +Set to the GFX minor generation number of the target being assembled for. For example, when assembling for a "GFX810" target this will be set to the integer value "1". The possible GFX minor generation numbers are presented in :ref:`Processors`. .option.machine_version_stepping -Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a “GFX704” target this will be set to the integer value “4”. The possible GFX stepping generation numbers are presented in :ref:`Processors`. +Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a "GFX704" target this will be set to the integer value "4". The possible GFX stepping generation numbers are presented in :ref:`Processors`. .. _.option.machine_version_stepping: @@ -2476,7 +2476,7 @@ Set to the GFX stepping generation number of the target being assembled for. For .option.machine_version_stepping +++++++++++++++++++++++++++++++++ -Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a “GFX704” target this will be set to the integer value “4”. The possible GFX stepping generation numbers are presented in :ref:`Processors`. +Set to the GFX stepping generation number of the target being assembled for. 
For example, when assembling for a "GFX704" target this will be set to the integer value "4". The possible GFX stepping generation numbers are presented in :ref:`Processors`. .. _.kernel.vgpr_count: @@ -2501,7 +2501,7 @@ Code Object V2 Directives (-mattr=-code-object-v3) :: - Code Object V2 is not the default code object version emitted by this version of LLVM. For a description of the directives supported + Code Object V2 is not the default code object version emitted by this version of LLVM. For a description of the directives supported with the default configuration (Code Object V3) see :ref:`Code Object V3 Directives (-mattr=+code-object-v3)`. AMDGPU ABI defines auxiliary data in output code object. In assembly source, one can specify them with assembler directives. @@ -2520,7 +2520,7 @@ major and minor are integers that specify the version of the HSA code object tha major, minor, and stepping are all integers that describe the instruction set architecture (ISA) version of the assembly program. -vendor and arch are quoted strings. vendor should always be equal to “AMD” and arch should always be equal to “AMDGPU”. +vendor and arch are quoted strings. vendor should always be equal to "AMD" and arch should always be equal to "AMDGPU". By default, the assembler will derive the ISA version, vendor, and arch from the value of the -mcpu option that is passed to the assembler. @@ -2561,7 +2561,7 @@ Code Object V2 Example Source Code (-mattr=-code-object-v3) :: - Code Object V2 is not the default code object version emitted by this version of LLVM. For a description of the predefined symbols + Code Object V2 is not the default code object version emitted by this version of LLVM. For a description of the predefined symbols available with the default configuration (Code Object V3). Here is an example of a minimal assembly source file, defining one HSA kernel: @@ -2611,21 +2611,21 @@ The AMDGPU assembler defines and updates some symbols automatically. These symbo .amdgcn.gfx_generation_number ++++++++++++++++++++++++++++++ -Set to the GFX major generation number of the target being assembled for. For example, when assembling for a “GFX9” target this will be set to the integer value “9”. The possible GFX major generation numbers are presented in :ref:`Processors`. +Set to the GFX major generation number of the target being assembled for. For example, when assembling for a "GFX9" target this will be set to the integer value "9". The possible GFX major generation numbers are presented in :ref:`Processors`. .. _.amdgcn.gfx_generation_minor: .amdgcn.gfx_generation_minor ++++++++++++++++++++++++++++++ -Set to the GFX minor generation number of the target being assembled for. For example, when assembling for a “GFX810” target this will be set to the integer value “1”. The possible GFX minor generation numbers are presented in :ref:`Processors`. +Set to the GFX minor generation number of the target being assembled for. For example, when assembling for a "GFX810" target this will be set to the integer value "1". The possible GFX minor generation numbers are presented in :ref:`Processors`. .. _.amdgcn.gfx_generation_stepping: .amdgcn.gfx_generation_stepping +++++++++++++++++++++++++++++++++ -Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a “GFX704” target this will be set to the integer value “4”. The possible GFX stepping generation numbers are presented in :ref:`Processors`. 
+Set to the GFX stepping generation number of the target being assembled for. For example, when assembling for a "GFX704" target this will be set to the integer value "4". The possible GFX stepping generation numbers are presented in :ref:`Processors`. .. _.amdgcn.next_free_vgpr: @@ -2670,7 +2670,7 @@ Optional directive which declares the target supported by the containing assembl Creates a correctly aligned AMDHSA kernel descriptor and a symbol, .kd, in the current location of the current section. Only valid when the OS is amdhsa. must be a symbol that labels the first instruction to execute, and does not need to be previously defined. -Marks the beginning of a list of directives used to generate the bytes of a kernel descriptor, as described in Kernel Descriptor. Directives which may appear in this list are described in AMDHSA Kernel Assembler Directives. Directives may appear in any order, must be valid for the target being assembled for, and cannot be repeated. Directives support the range of values specified by the field they reference in Kernel Descriptor. If a directive is not specified, it is assumed to have its default value, unless it is marked as “Required”, in which case it is an error to omit the directive. This list of directives is terminated by an .end_amdhsa_kernel directive. +Marks the beginning of a list of directives used to generate the bytes of a kernel descriptor, as described in Kernel Descriptor. Directives which may appear in this list are described in AMDHSA Kernel Assembler Directives. Directives may appear in any order, must be valid for the target being assembled for, and cannot be repeated. Directives support the range of values specified by the field they reference in Kernel Descriptor. If a directive is not specified, it is assumed to have its default value, unless it is marked as "Required", in which case it is an error to omit the directive. This list of directives is terminated by an .end_amdhsa_kernel directive. **AMDHSA Kernel Assembler Directives** @@ -2785,7 +2785,7 @@ If an assembly source file contains multiple kernels and/or functions, the .amdg .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" // optional // gpr tracking symbols are implicitly set to zero - .text + .text .globl kern0 .p2align 8 .type kern0,@function @@ -2860,7 +2860,7 @@ Additional Documentation [AMD-GCN-GFX8] (`1 `_, `2 `_) `AMD GCN3 Instruction Set Architecture `_ -[AMD-GCN-GFX9] (`1 `_, `2 `_) `AMD “Vega” Instruction Set Architecture `_ +[AMD-GCN-GFX9] (`1 `_, `2 `_) `AMD "Vega" Instruction Set Architecture `_ [AMD-ROCm] (`1 `_, `2 `_, `3 `_, `4 `_) `ROCm: Open Platform for Development, Discovery and Education Around GPU Computing `_ @@ -2870,7 +2870,7 @@ Additional Documentation [`DWARF `_] `DWARF Debugging Information Format `_ -[YAML] (`1 `_, `2 `_) `YAML Ain’t Markup Language (YAML™) Version 1.2 `_ +[YAML] (`1 `_, `2 `_) `YAML Ain't Markup Language (YAML(TM)) Version 1.2 `_ [MsgPack] (`1 `_, `2 `_, `3 `_) `Message Pack `_ diff --git a/ROCm_Compiler_SDK/ocml.rst b/ROCm_Compiler_SDK/ocml.rst index 07da9ac9..3deeafd3 100644 --- a/ROCm_Compiler_SDK/ocml.rst +++ b/ROCm_Compiler_SDK/ocml.rst @@ -7,7 +7,7 @@ OCML User Guide ################ What Is OCML ************** -OCML is an LLVM-IR bitcode library designed to relieve language compiler and runtime implementers of the burden of implementing efficient and accurate mathematical functions. 
It is essentially a “libm” in intermediate representation with a fixed, simple API that can be linked in to supply the implementations of most standard low-level mathematical functions provided by the language. +OCML is an LLVM-IR bitcode library designed to relieve language compiler and runtime implementers of the burden of implementing efficient and accurate mathematical functions. It is essentially a "libm" in intermediate representation with a fixed, simple API that can be linked in to supply the implementations of most standard low-level mathematical functions provided by the language. Using OCML *********** @@ -16,11 +16,11 @@ Standard Usage OCML is expected to be used in a standard LLVM compilation flow as follows: * Compile source modules to LLVM-IR bitcode (clang) - * Link program bitcode, “wrapper” bitcode, OCML bitcode, and OCML control functions (llvm-link) + * Link program bitcode, "wrapper" bitcode, OCML bitcode, and OCML control functions (llvm-link) * Generic optimizations (opt) * Code generation (llc) -Here, “wrapper” bitcode denotes a thin library responsible for mapping mangled built-in function calls as produced by clang to the OCML API. An example in C might look like +Here, "wrapper" bitcode denotes a thin library responsible for mapping mangled built-in function calls as produced by clang to the OCML API. An example in C might look like :: @@ -71,9 +71,9 @@ OCML functions follow a simple naming convention: where {function} is generally the familiar libm name of the function, and {type suffix} indicates the type of the floating point arguments or results, and is one of - f16 – 16 bit floating point (half precision) - f32 – 32 bit floating point (single precision) - f64 – 64 bit floating point (double precision) + f16 - 16 bit floating point (half precision) + f32 - 32 bit floating point (single precision) + f64 - 64 bit floating point (double precision) For example, __ocml_sqrt_f32 is the name of the OCML single precision square root function. @@ -82,7 +82,7 @@ OCML does not currently support higher than double precision due to the lack of Supported functions ******************** -The following table contains a list of {function} currently supported by OCML, a brief description of each, and the maximum relative error in ULPs for each floating point type. A “c” in the last 3 columns indicates that the function is required to be correctly rounded. +The following table contains a list of {function} currently supported by OCML, a brief description of each, and the maximum relative error in ULPs for each floating point type. A "c" in the last 3 columns indicates that the function is required to be correctly rounded. +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | {function} | Description | f32 max err | f64 max err | f16 max err | @@ -91,7 +91,7 @@ The following table contains a list of {function} currently supported by OCML, a +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | acosh | arc hyperbolic cosine | 4 | 4 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ -| acospi | arc cosine / π | 5 | 5 | 2 | +| acospi | arc cosine / ? 
| 5 | 5 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | add_{rm} | add with specific rounding mode | c | c | c | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ @@ -189,9 +189,9 @@ The following table contains a list of {function} currently supported by OCML, a +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | len4 | four argument hypot | 2 | 2 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ -| lgamma | log Γ function | 6(>0) | 4(>0) | 3(>0) | +| lgamma | log ? function | 6(>0) | 4(>0) | 3(>0) | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ -| lgamma_r | log Γ function with sign | 6(>0) | 4(>0) | 3(>0) | +| lgamma_r | log ? function with sign | 6(>0) | 4(>0) | 3(>0) | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | log10 | log base 10 | 3 | 3 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ @@ -279,7 +279,7 @@ The following table contains a list of {function} currently supported by OCML, a +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | tanpi | tangent of argument times pi | 6 | 6 | 2 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ -| tgamma | true Γ function | 16 | 16 | 4 | +| tgamma | true ? function | 16 | 16 | 4 | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ | trunc | round to integer, towards zero | c | c | c | +------------+---------------------------------------------------------------------------+-------------+-------------+-------------+ @@ -290,8 +290,8 @@ The following table contains a list of {function} currently supported by OCML, a For the functions supporting specific roundings, the rounding mode {rm} can be one of - * rte – round towards nearest even - * rtp – round towards positive infinity - * rtn – round towards negative infinity - * rtz – round towards zero + * rte - round towards nearest even + * rtp - round towards positive infinity + * rtn - round towards negative infinity + * rtz - round towards zero diff --git a/ROCm_Glossary/ROCm-Glossary.rst b/ROCm_Glossary/ROCm-Glossary.rst index 649dc3cf..eb259cbd 100644 --- a/ROCm_Glossary/ROCm-Glossary.rst +++ b/ROCm_Glossary/ROCm-Glossary.rst @@ -30,10 +30,10 @@ PCI Express (PCIe) was developed as the next generation I/O system interconnect A Queue is a runtime-allocated resource that contains a packet buffer and is associated with a packet processor. The packet processor tracks which packets in the buffer have already been processed. When it has been informed by the application that a new packet has been enqueued, the packet processor is able to process it because the packet format is standard and the packet contents are self-contained -- they include all the necessary information to run a command. 
A queue has an associated set of high-level operations defined in "HSA Runtime Specification" (API functions in host code) and "HSA Programmer Reference Manual Specification" (kernel code). **HSA (Heterogeneous System Architecture) :** -HSA provides a unified view of fundamental computing elements. HSA allows a programmer to write applications that seamlessly integrate CPUs (called latency compute units) with GPUs (called throughput compute units), while benefiting from the best attributes ofeach. HSA creates an improved processor design that exposes the benefits and capabilities of mainstream programmable compute elements, working together seamlessly.HSA is all about delivering new, improved user experiences through advances in computing architectures that deliver improvements across all four key vectors: improved power efficiency; improved performance; improved programmability; and broad portability across computing devices.For more on `HSA `_. +HSA provides a unified view of fundamental computing elements. HSA allows a programmer to write applications that seamlessly integrate CPUs (called latency compute units) with GPUs (called throughput compute units), while benefiting from the best attributes of each. HSA creates an improved processor design that exposes the benefits and capabilities of mainstream programmable compute elements, working together seamlessly. HSA is all about delivering new, improved user experiences through advances in computing architectures that deliver improvements across all four key vectors: improved power efficiency; improved performance; improved programmability; and broad portability across computing devices. For more on `HSA `_. **AQL Architectured Queueing Language :** -The Architected Queuing Language (AQL) is a standard binary interface used to describe commands such as a kernel dispatch. An AQL packet is a user-mode buffer with a specific format that encodes one command. AQL allows agents to build and enqueue their own command packets, enabling fast, low-power dispatch. AQL also provides support for kernel agent queue submissions: the kernel agent kernel can write commands in AQL format. +The Architected Queuing Language (AQL) is a standard binary interface used to describe commands such as a kernel dispatch. An AQL packet is a user-mode buffer with a specific format that encodes one command. AQL allows agents to build and enqueue their own command packets, enabling fast, low-power dispatch. AQL also provides support for kernel agent queue submissions: the kernel agent kernel can write commands in AQL format. diff --git a/ROCm_Libraries/ROCm_Libraries.rst b/ROCm_Libraries/ROCm_Libraries.rst index 8f4626da..c11c255c 100644 --- a/ROCm_Libraries/ROCm_Libraries.rst +++ b/ROCm_Libraries/ROCm_Libraries.rst @@ -1269,7 +1269,7 @@ The root of this repository has a helper bash script install.sh to build and ins **Manual build (all supported platforms)** If you use a distro other than Ubuntu, or would like more control over the build process, the hipblas build has helpful information on how to configure cmake and manually build. - + Build ######## @@ -1437,7 +1437,7 @@ Batched and strided GEMM API ------------------------------- hipBLAS GEMM can process matrices in batches with regular strides.
There are several permutations of these API's, the following is an example that takes everything -:: +:: hipblasStatus_t hipblasSgemmStridedBatched( hipblasHandle_t handle, @@ -1570,7 +1570,7 @@ Running Statistical Tests :: # Go to rocRAND build directory - cd rocRAND; cd build + cd rocRAND; cd build # To run "crush" test, which verifies that generated pseudorandom # numbers are of high quality: # engine -> all, xorwow, mrg32k3a, mtgp32, philox @@ -1704,7 +1704,7 @@ The following is a simple example code that shows how to use rocFFT to compute a // Copy result back to host std::vector y(N); hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost); - + // Print results for (size_t i = 0; i < N; i++) { @@ -1810,7 +1810,7 @@ Execution info The execution api :cpp:func:`rocfft_execute` takes a rocfft_execution_info parameter. This parameter needs to be created and setup by the user and passed to the execution api. The execution info handle encapsulates -information such as execution mode, pointer to any work buffer etc. It can also hold information that are +information such as execution mode, pointer to any work buffer etc. It can also hold information that is a side effect of execution such as event objects. The following functions deal with managing execution info object. Note that the *set* functions below need to be called before execution and *get* functions after execution. @@ -2012,7 +2012,7 @@ rocSPARSE with dependencies and client can be built using the following commands -DBUILD_CLIENTS_BENCHMARKS=ON \ -DBUILD_CLIENTS_SAMPLES=ON \ -DBUILD_VERBOSE=OFF \ - -DBUILD_SHARED_LIBS=ON + -DBUILD_SHARED_LIBS=ON # Compile rocSPARSE library make -j$(nproc) @@ -2028,7 +2028,7 @@ Common build problems #. **Issue:** HCC RUNTIME ERROR: Failed to find compatible kernel - **Solution:** Add the following to the cmake command when configuring: -DCMAKE_CXX_FLAGS=”–amdgpu-target=gfx803,gfx900,gfx906,gfx908” + **Solution:** Add the following to the cmake command when configuring: -DCMAKE_CXX_FLAGS="--amdgpu-target=gfx803,gfx900,gfx906,gfx908" #. **Issue:** Could not find a package configuration file provided by "ROCM" with any of the following names: ROCMConfig.cmake |br| @@ -2046,7 +2046,7 @@ You can test the installation by running one of the rocSPARSE examples, after su # Navigate to clients binary directory $ cd rocSPARSE/build/release/clients/staging - + # Execute rocSPARSE example $ ./example_csrmv 1000 @@ -2056,7 +2056,7 @@ Supported Targets Currently, rocSPARSE is supported under the following operating systems - + * Ubuntu 16.04 * Ubuntu 18.04 @@ -2093,7 +2093,7 @@ The above is a HIP (and CUDA) device management approach and has nothing to do w Once users set the device, they create a handle with `rocsparse_create_handle() `_. -Subsequent rocSPARSE routines take this handle as an input parameter. rocSPARSE ONLY queries (by hipGetDevice()) the user’s device; rocSPARSE does NOT set the device for users. If rocSPARSE does not see a valid device, it returns an error message. It is the users’ responsibility to provide a valid device to rocSPARSE and ensure the device safety. +Subsequent rocSPARSE routines take this handle as an input parameter. rocSPARSE ONLY queries (by hipGetDevice()) the user's device; rocSPARSE does NOT set the device for users. If rocSPARSE does not see a valid device, it returns an error message. It is the users' responsibility to provide a valid device to rocSPARSE and ensure the device safety.
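A minimal host-side sketch of this handle lifecycle is shown below (assuming the rocsparse.h and hip_runtime_api.h headers are available and a single GPU is visible as device 0; return-status checks are omitted for brevity):

::

  #include <hip/hip_runtime_api.h>
  #include <rocsparse.h>

  int main()
  {
      // Set the device for this host thread first; the handle is created for it
      hipSetDevice(0);

      // Create the rocSPARSE handle that subsequent routines take as input
      rocsparse_handle handle;
      rocsparse_create_handle(&handle);

      // ... call rocSPARSE routines with this handle ...

      // Destroy the handle before changing devices or exiting
      rocsparse_destroy_handle(handle);
      return 0;
  }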
Users CANNOT switch devices between `rocsparse_create_handle() `_ and `rocsparse_destroy_handle() `_. If users want to change device, they must destroy the current handle and create another rocSPARSE handle. @@ -2693,7 +2693,7 @@ rocsparse_hybmv() :project: rocSPARSE .. doxygenfunction:: rocsparse_chybmv - :project: rocSPARSE + :project: rocSPARSE .. doxygenfunction:: rocsparse_zhybmv :project: rocSPARSE @@ -3033,31 +3033,31 @@ rocSOLVER *************** .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: Introduction ############## -An implementation of Lapack routines on top of AMD’s Radeon Open Compute Platform (ROCm) runtime and toolchains. -rocSOLVER is implemented in the HIP programming language; it is based on rocBLAS, an optimized BLAS -implementation for AMD’s latest discrete GPUs. More information about rocBLAS can be found +An implementation of Lapack routines on top of AMD's Radeon Open Compute Platform (ROCm) runtime and toolchains. +rocSOLVER is implemented in the HIP programming language; it is based on rocBLAS, an optimized BLAS +implementation for AMD's latest discrete GPUs. More information about rocBLAS can be found `here `_. Build and install ################### -rocSOLVER requires `cmake `_ -and `ROCm `_, including -`hip `_ and -`rocBLAS `_, to be installed. +rocSOLVER requires `cmake `_ +and `ROCm `_, including -`hip `_ and +`rocBLAS `_, to be installed. Once these requirements are satisfied, the following instructions will build and install rocSOLVER: .. code-block:: bash - mkdir build && cd build CXX=/opt/rocm/bin/hcc cmake .. make @@ -3066,56 +3066,56 @@ instructions will build and install rocSOLVER: Brief description and functionality ###################################### -rocSolver Library is in early stages of active development. New features and functionality is being continuosly added. New -functionality is documented at each release of the ROCm platform. +The rocSOLVER library is in the early stages of active development. New features and functionality are being continuously added. New +functionality is documented at each release of the ROCm platform. The following table summarizes the LAPACK functionality implemented in rocSOLVER's last release.
=============================== ====== ====== ============== ============== Lapack Auxiliary Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_laswp** x x x x -**rocsolver_larfg** x x +**rocsolver_laswp** x x x x +**rocsolver_larfg** x x **rocsolver_larft** x x **rocsolver_larf** x x -**rocsolver_larfb** x x -**rocsolver_org2r** x x -**rocsolver_orgqr** x x -**rocsolver_orgl2** x x -**rocsolver_orglq** x x -**rocsolver_orgbr** x x -**rocsolver_orm2r** x x -**rocsolver_ormqr** x x +**rocsolver_larfb** x x +**rocsolver_org2r** x x +**rocsolver_orgqr** x x +**rocsolver_orgl2** x x +**rocsolver_orglq** x x +**rocsolver_orgbr** x x +**rocsolver_orm2r** x x +**rocsolver_ormqr** x x =============================== ====== ====== ============== ============== =============================== ====== ====== ============== ============== Lapack Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_potf2** x x -rocsolver_potf2_batched x x -rocsolver_potf2_strided_batched x x -**rocsolver_potrf** x x -rocsolver_potrf_batched x x -rocsolver_potrf_strided_batched x x +**rocsolver_potf2** x x +rocsolver_potf2_batched x x +rocsolver_potf2_strided_batched x x +**rocsolver_potrf** x x +rocsolver_potrf_batched x x +rocsolver_potrf_strided_batched x x **rocsolver_getf2** x x x x rocsolver_getf2_batched x x x x rocsolver_getf2_strided_batched x x x x -**rocsolver_getrf** x x x x +**rocsolver_getrf** x x x x rocsolver_getrf_batched x x x x rocsolver_getrf_strided_batched x x x x -**rocsolver_geqr2** x x +**rocsolver_geqr2** x x rocsolver_geqr2_batched x x rocsolver_geqr2_strided_batched x x -**rocsolver_geqrf** x x -rocsolver_geqrf_batched x x +**rocsolver_geqrf** x x +rocsolver_geqrf_batched x x rocsolver_geqrf_strided_batched x x -**rocsolver_gelq2** x x +**rocsolver_gelq2** x x rocsolver_gelq2_batched x x rocsolver_gelq2_strided_batched x x -**rocsolver_gelqf** x x -rocsolver_gelqf_batched x x +**rocsolver_gelqf** x x +rocsolver_gelqf_batched x x rocsolver_gelqf_strided_batched x x -**rocsolver_getrs** x x x x +**rocsolver_getrs** x x x x rocsolver_getrs_batched x x x x rocsolver_getrs_strided_batched x x x x =============================== ====== ====== ============== ============== @@ -3123,38 +3123,38 @@ rocsolver_getrs_strided_batched x x x x Benchmarking and Testing ########################## -Additionaly, rocSOLVER has a basic/preliminary infrastructure for testing and benchmarking similar to that of rocBLAS. +Additionaly, rocSOLVER has a basic/preliminary infrastructure for testing and benchmarking similar to that of rocBLAS. -On a normal installation, clients should be located in the directory **/build/clients/staging**. +On a normal installation, clients should be located in the directory **/build/clients/staging**. **rocsolver-test** executes a suite of `Google tests `_ (*gtest*) that verifies the correct -functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by +functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by `NETLib LAPACK `_ on the CPU. Calling the rocSOLVER gtest client with the --help flag .. code-block:: bash - + ./rocsolver-test --help -returns information on different flags that control the behavior of the gtests. 
+returns information on different flags that control the behavior of the gtests. **rocsolver-bench** allows to run any rocSOLVER function with random data of the specified dimensions; it compares the computed results, and provides basic -performance information (as for now, execution times). +performance information (as for now, execution times). -Similarly, +Similarly, .. code-block:: bash - + ./rocsolver-bench --help -returns information on how to use the rocSOLVER benchmark client. - +returns information on how to use the rocSOLVER benchmark client. + rocSOLVER API ############### -This section provides details of the rocSOLVER library API as in release +This section provides details of the rocSOLVER library API as in release `ROCm 2.10 `_. @@ -3162,7 +3162,7 @@ This section provides details of the rocSOLVER library API as in release Types ===== -Most rocSOLVER types are aliases of rocBLAS types. +Most rocSOLVER types are aliases of rocBLAS types. See rocBLAS types `here `_. Definitions @@ -3567,7 +3567,7 @@ rocsolver_getrs_strided_batched() Auxiliaries ========================= -rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions +rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions `here `_. rocSOLVER handle auxiliaries @@ -3666,7 +3666,7 @@ The hipSPARSE interface is compatible with rocSPARSE and cuSPARSE-v2 APIs. Porti CSRMV API ########### - + :: hipsparseStatus_t @@ -3903,7 +3903,7 @@ Common build problems ROCBLAS.cmake |br| rocblas-config.cmake - **Solution:** Install `rocBLAS `_ either from source or from 'AMD ROCm repository `_ + **Solution:** Install `rocBLAS `_ either from source or from 'AMD ROCm repository `_ Simple Test ########### @@ -3918,7 +3918,7 @@ You can test the installation by running a CG solver on a Laplace matrix. After ./clients/staging/cg gr_30_30.mtx -For more information regarding rocALUTION library and corresponding API documentation, refer +For more information regarding rocALUTION library and corresponding API documentation, refer `rocALUTION `_ @@ -4891,7 +4891,7 @@ Tensile is a **tool** for creating a benchmark-driven backend library for GEMMs, Overview for creating a custom TensileLib backend library for your application: -1. Install the `PyYAML and cmake dependency`_ (mandatory), ``git clone and cd Tensile`` +1. Install the `PyYAML and cmake dependency`_ (mandatory), ``git clone and cd Tensile`` 2. Create a `benchmark config.yaml`_ file in ``./Tensile/Configs/`` 3. `Run the benchmark`_. After the benchmark is finished. Tensile will dump 4 directories: 1 & 2 is about benchmarking. 3 & 4 is the summarized results from your library (like rocBLAS) viewpoints. @@ -4950,7 +4950,7 @@ Tensile uses an incremental and "programmable" `benchmarking protocol`_. Example Benchmark config.yaml as input file to Tensile ------------------------------------------------------- -:: +:: GlobalParameters: PrintLevel: 1 @@ -5160,18 +5160,18 @@ Each step of the benchmark can override what problem sizes will be benchmarked. 1. [1968] * Benchmark only size 1968; n = 1. - + 2. [16, 1920] * Benchmark sizes 16 to 1968 using the default step size (=16); n = 123. - + 3. [16, 32, 1968] * Benchmark sizes 16 to 1968 using a step size of 32; n = 61. - + 4. [64, 32, 16, 1968] * Benchmark sizes from 64 to 1968 with a step size of 32. Also, increase the step size by 16 each iteration. 
* This causes fewer sizes to be benchmarked when the sizes are large, and more benchmarks where the sizes are small; this is typically desired behavior. * n = 16 (64, 96, 144, 208, 288, 384, 496, 624, 768, 928, 1104, 1296, 1504, 1728, 1968). The stride at the beginning is 32, but the stride at the end is 256. - + 5. 0 * The size of this index is just whatever size index 0 is. For a 3-dimensional ProblemType, this allows benchmarking only a 2- dimensional or 1-dimensional slice of problem sizes. @@ -5255,12 +5255,12 @@ Compilers -------------- * For Tensile_BACKEND = OpenCL1.2 *(untested)* - + * Visual Studio 14 (2015). (VS 2012 may also be supported; c++11 should no longer be required by Tensile. Need to verify.) * GCC 4.8 and above * For Tensile_BACKEND = HIP - + * Public ROCm @@ -5273,7 +5273,7 @@ Tensile can be installed via: 1. Download repo and don't install; install PyYAML dependency manually and call python scripts manually: :: - + git clone https://github.com/ROCmSoftwarePlatform/Tensile.git python Tensile/Tensile/Tensile.py your_custom_config.yaml your_benchmark_path @@ -5329,7 +5329,7 @@ The kernel parameters affect many aspects of performance. Changing a parameter m .. image:: img1.png :align: center - + How N-Dimensional Tensor Contractions Are Mapped to Finite-Dimensional GPU Kernels -------------------------------------------------------------------------------------- @@ -5372,7 +5372,7 @@ The device languages Tensile supports for the gpu kernels is * HIP * Assembly - * gfx803 + * gfx803 * gfx900 Library Logic @@ -5455,7 +5455,7 @@ After running the `benchmark`_ and generating `library config files`_, you're re ) target_link_libraries( TARGET Tensile ) -TODO: Where is the Tensile include directory? +TODO: Where is the Tensile include directory? .. _benchmark: https://rocm-documentation.readthedocs.io/en/latest/ROCm_Libraries/ROCm_Libraries.html#id39 .. _library config files: https://rocm-documentation.readthedocs.io/en/latest/ROCm_Libraries/ROCm_Libraries.html#id46 @@ -5668,7 +5668,7 @@ In order to verify the build and capability of ROCm SMI on your system and to se $ cmake -DROCM_DIR= /tests/rocm_smi_test $ make -To run the test, execute the program rsmitst that is built from the steps above. +To run the test, execute the program rsmitst that is built from the steps above. Usage Basics ############## @@ -5691,25 +5691,25 @@ A simple "Hello World" type program that displays the device ID of detected devi #include #include "rocm_smi/rocm_smi.h" int main() { - rsmi_status_t ret; - uint32_t num_devices; - uint64_t dev_id; - - // We will skip return code checks for this example, but it + rsmi_status_t ret; + uint32_t num_devices; + uint64_t dev_id; + + // We will skip return code checks for this example, but it // is recommended to always check this as some calls may not // apply for some devices or ROCm releases - + ret = rsmi_init(0); ret = rsmi_num_monitor_devices(&num_devices); - + for (int i=0; i < num_devices; ++i) { ret = rsmi_dev_id_get(i, &dev_id); // dev_id holds the device ID of device i, upon a - // successful call - } + // successful call + } ret = rsmi_shut_down(); return 0; - } + } ***** RCCL @@ -5761,7 +5761,7 @@ To build the library : $ cd rccl $ mkdir build $ cd build - $ CXX=/opt/rocm/bin/hcc cmake + $ CXX=/opt/rocm/bin/hcc cmake $ make -j 8 @@ -5769,7 +5769,7 @@ To build the library : You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, :: - + apt install rocm-cmake. 
@@ -5867,7 +5867,7 @@ Build And Install # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the HCC compiler. # [CXX=hcc] cmake ../. # or cmake-gui ../. - + # Build make -j4 @@ -5886,7 +5886,7 @@ Using hipCUB In A Project ########################### Recommended way of including hipCUB into a CMake project is by using its package configuration files. - + :: # On ROCm hipCUB requires rocPRIM @@ -5997,7 +5997,7 @@ First create a build directory: :: - mkdir build; + mkdir build; cd build; @@ -6119,7 +6119,7 @@ Deprecated Libraries hCRNG ###### -hCRNG has been **deprecated** and has been replaced by `rocRAND `_ +hCRNG has been **deprecated** and has been replaced by `rocRAND `_ The hcRNG library is an implementation of uniform random number generators targeting the AMD heterogeneous hardware via HCC compiler runtime. The computational resources of underlying AMD heterogenous compute gets exposed and exploited through the HCC C++ frontend. Refer `here `_ for more details on HCC compiler. @@ -6145,7 +6145,7 @@ For more information, please refer :ref:`CLFF` clBLAS ######## -This repository houses the code for the OpenCL™ BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. +This repository houses the code for the OpenCL(TM) BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves. @@ -6154,18 +6154,18 @@ For more information, please refer :ref:`CLB` clSPARSE ######### - -an OpenCL™ library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. + +an OpenCL(TM) library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. For more information, please refer :ref:`CLS` clRNG ######## - + A library for uniform random number generation in OpenCL. -Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4×32-10 generators. 
+Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4x32-10 generators. For more information, please refer :ref:`CLR` diff --git a/ROCm_Libraries/dep-lib.rst b/ROCm_Libraries/dep-lib.rst index b8fb719d..39e6623e 100644 --- a/ROCm_Libraries/dep-lib.rst +++ b/ROCm_Libraries/dep-lib.rst @@ -4,7 +4,7 @@ hcRNG ********** -hCRNG has been deprecated and has been replaced by `rocRAND `_ +hCRNG has been deprecated and has been replaced by `rocRAND `_ ################################################################################################################# Introduction @@ -30,7 +30,7 @@ file: Randomarray.cpp :: #!c++ - + //This example is a simple random array generation and it compares host output with device output //Random number generator Mrg31k3p #include @@ -43,7 +43,7 @@ file: Randomarray.cpp #include #include using namespace hc; - + int main() { hcrngStatus status = HCRNG_SUCCESS; @@ -53,7 +53,7 @@ file: Randomarray.cpp size_t streamCount = 10; //Number of random numbers to be generated //numberCount must be a multiple of streamCount - size_t numberCount = 100; + size_t numberCount = 100; //Enumerate the list of accelerators std::vectoracc = hc::accelerator::get_all(); accelerator_view accl_view = (acc[1].create_view()); @@ -61,21 +61,21 @@ file: Randomarray.cpp float *Random1 = (float*) malloc(sizeof(float) * numberCount); float *Random2 = (float*) malloc(sizeof(float) * numberCount); float *outBufferDevice = hc::am_alloc(sizeof(float) * numberCount, acc[1], 0); - + //Create streams hcrngMrg31k3pStream *streams = hcrngMrg31k3pCreateStreams(NULL, streamCount, &streamBufferSize, NULL); hcrngMrg31k3pStream *streams_buffer = hc::am_alloc(sizeof(hcrngMrg31k3pStream) * streamCount, acc[1], 0); accl_view.copy(streams, streams_buffer, streamCount* sizeof(hcrngMrg31k3pStream)); - - //Invoke random number generators in device (here strean_length and streams_per_thread arguments are default) + + //Invoke random number generators in device (here strean_length and streams_per_thread arguments are default) status = hcrngMrg31k3pDeviceRandomU01Array_single(accl_view, streamCount, streams_buffer, numberCount, outBufferDevice); - + if(status) std::cout << "TEST FAILED" << std::endl; accl_view.copy(outBufferDevice, Random1, numberCount * sizeof(float)); - + //Invoke random number generators in host for (size_t i = 0; i < numberCount; i++) - Random2[i] = hcrngMrg31k3pRandomU01(&streams[i % streamCount]); + Random2[i] = hcrngMrg31k3pRandomU01(&streams[i % streamCount]); // Compare host and device outputs for(int i =0; i < numberCount; i++) { if (Random1[i] != Random2[i]) { @@ -87,7 +87,7 @@ file: Randomarray.cpp continue; } if(!ispassed) std::cout << "TEST FAILED" << std::endl; - + //Free host resources free(Random1); free(Random2); @@ -95,8 +95,8 @@ file: Randomarray.cpp hc::am_free(outBufferDevice); hc::am_free(streams_buffer); return 0; - } - + } + * Compiling the example code: @@ -141,8 +141,8 @@ and **Reboot the system** Once Reboot, to verify that the ROCm stack completed successfully you can execute HSA vector_copy sample application: :: - cd /opt/rocm/hsa/sample - make + cd /opt/rocm/hsa/sample + make ./vector_copy **Library 
Installation** @@ -150,14 +150,14 @@ Once Reboot, to verify that the ROCm stack completed successfully you can execut **a. Install using Prebuilt debian** :: - + wget https://github.com/ROCmSoftwarePlatform/hcRNG/blob/master/pre-builds/hcrng-master-184472e-Linux.deb sudo dpkg -i hcrng-master-184472e-Linux.deb **b. Build debian from source** :: - + git clone https://github.com/ROCmSoftwarePlatform/hcRNG.git && cd hcRNG chmod +x build.sh && ./build.sh @@ -286,7 +286,7 @@ AMD is hosting both debian and rpm repositories for the ROCm 2.7 packages. The p Complete installation steps of ROCm can be found `Here `_ -or +or For Debian based systems, like Ubuntu, configure the Debian ROCm repository as follows: @@ -519,7 +519,7 @@ Build dependencies To develop the clFFT library code on a Windows operating system, ensure to install the following packages on your system: - * Windows® 7/8.1 + * Windows(R) 7/8.1 * Visual Studio 2012 or later @@ -548,7 +548,7 @@ To test the developed clFFT library code, ensure to install the following packag * Googletest v1.6 * Latest FFTW - + * Latest Boost Performance infrastructure @@ -565,7 +565,7 @@ clBLAS For Github repository `clBLAS `_ -This repository houses the code for the OpenCL™ BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. +This repository houses the code for the OpenCL(TM) BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves. @@ -716,7 +716,7 @@ Build dependencies ########################################## **Library for Windows** - * Windows® 7/8 + * Windows(R) 7/8 * Visual Studio 2010 SP1, 2012 * An OpenCL SDK, such as APP SDK 2.8 * Latest CMake @@ -749,10 +749,10 @@ Python ************** clSPARSE ************** - + For Github repository `clSPARSE `_ -an OpenCL™ library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. +an OpenCL(TM) library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. 
What's new in clSPARSE v0.10.1 ################################### @@ -779,7 +779,7 @@ clSPARSE features * Dense to CSR conversions (& converse) * COO to CSR conversions (& converse) * Functions to read matrix market files in COO or CSR format -True in spirit with the other clMath libraries, clSPARSE exports a “C” interface to allow projects to build wrappers around clSPARSE in any language they need. A great deal of thought and effort went into designing the API’s to make them less ‘cluttered’ compared to the older clMath libraries. OpenCL state is not explicitly passed through the API, which enables the library to be forward compatible when users are ready to switch from OpenCL 1.2 to OpenCL 2.0 3 +True in spirit with the other clMath libraries, clSPARSE exports a "C" interface to allow projects to build wrappers around clSPARSE in any language they need. A great deal of thought and effort went into designing the API's to make them less 'cluttered' compared to the older clMath libraries. OpenCL state is not explicitly passed through the API, which enables the library to be forward compatible when users are ready to switch from OpenCL 1.2 to OpenCL 2.0 3 API semantic versioning ############################## @@ -808,7 +808,7 @@ clSPARSE is licensed under the `Apache License `_) * Solution (.sln) or @@ -850,12 +850,12 @@ clSPARSE is licensed under the `Apache License `_ A library for uniform random number generation in OpenCL. -Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4×32-10 generators. +Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4x32-10 generators. What's New @@ -873,7 +873,7 @@ Building ############## 1. Install the runtime dependency: * An OpenCL SDK, such as APP SDK. - + 2. Install the build dependencies: * The CMake cross-platform build system. Visual Studio users can use CMake Tools for Visual Studio. @@ -906,7 +906,7 @@ On a 64-bit Linux platform, steps 3 through 9 from above, executed in a Bash-com export CLRNG_ROOT=$PWD/package export LD_LIBRARY_PATH=$CLRNG_ROOT/lib64:$LD_LIBRARY_PATH $CLRNG_ROOT/bin/CTest - + **Examples** Examples can be found in src/client. The compiled client program examples can be found under the bin subdirectory of the installation package ($CLRNG_ROOT/bin under Linux). Note that the examples expect an OpenCL GPU device to be available. @@ -1047,7 +1047,7 @@ The following are the steps to use the library **ROCM 2.7 Installation** -To Know more about ROCM refer +To Know more about ROCM refer https://github.com/RadeonOpenCompute/ROCm/blob/master/README.md **a. 
Installing Debian ROCM repositories** @@ -1083,8 +1083,8 @@ and Reboot the system Once Reboot, to verify that the ROCm stack completed successfully you can execute HSA vector_copy sample application: - * cd /opt/rocm/hsa/sample - * make + * cd /opt/rocm/hsa/sample + * make * ./vector_copy **Library Installation** @@ -1129,7 +1129,7 @@ The following are the sub-routines that are implemented KeyFeature ############# - + * Support 1D, 2D and 3D Fast Fourier Transforms * Supports R2C, C2R, C2C, D2Z, Z2D and Z2Z Transforms * Support Out-Of-Place data storage @@ -1145,7 +1145,7 @@ This section lists the known set of hardware and software requirements to build **Hardware** - * CPU: mainstream brand, Better if with >=4 Cores Intel Haswell based CPU + * CPU: mainstream brand, Better if with >=4 Cores Intel Haswell based CPU * System Memory >= 4GB (Better if >10GB for NN application over multiple GPUs) * Hard Drive > 200GB (Better if SSD or NVMe driver for NN application over multiple GPUs) * Minimum GPU Memory (Global) > 2GB @@ -1197,7 +1197,7 @@ file: hcfft_1D_R2C.cpp :: #!c++ - + #include #include #include "hcfft.h" @@ -1239,9 +1239,9 @@ file: hcfft_1D_R2C.cpp free(input); free(output); hc::am_free(idata); - hc::am_free(odata); + hc::am_free(odata); } - + * Compiling the example code: Assuming the library and compiler installation is followed as in installation. @@ -1264,9 +1264,9 @@ This sections enumerates the list of tested combinations of Hardware and system **GPU Cards** - * Radeon R9 Nano + * Radeon R9 Nano * Radeon R9 FuryX - * Radeon R9 Fury + * Radeon R9 Fury * Kaveri and Carizo APU **Server System** diff --git a/ROCm_Libraries/hipsparse_wiki.rst b/ROCm_Libraries/hipsparse_wiki.rst index 02aef504..76dced9c 100644 --- a/ROCm_Libraries/hipsparse_wiki.rst +++ b/ROCm_Libraries/hipsparse_wiki.rst @@ -272,7 +272,7 @@ Exported sparse BLAS functions hipSPARSE includes the following auxiliary functions +------------------------------+ - | Function name | + | Function name | +==============================+ | hipsparseCreate | +------------------------------+ @@ -322,86 +322,86 @@ hipSPARSE includes the following auxiliary functions +------------------------------+ | hipsparseCreateCsrilu02Info | +------------------------------+ - - - + + + hipSPARSE includes the following Level 1, 2 and conversion functions ####################################################################### - + **Level 1** -================ ========== ========= ================ ================= ====== +================ ========== ========= ================ ================= ====== Function single double single complex double complex half ================ ========== ========= ================ ================= ====== -hipsparseXaxpyi x x -hipsparseXdoti x x -hipsparseXgthr x x -hipsparseXgthrz x x -hipsparseXroti x x -hipsparseXsctr x x +hipsparseXaxpyi x x +hipsparseXdoti x x +hipsparseXgthr x x +hipsparseXgthrz x x +hipsparseXroti x x +hipsparseXsctr x x ================ ========== ========= ================ ================= ====== **Level 2** -================================ ========== ========= ================ ================= ====== +================================ ========== ========= ================ ================= ====== Function single double single complex double complex half ================================ ========== ========= ================ ================= ====== -hipsparseXcsrmv x x -hipsparseXcsrsv2_bufferSize x x -hipsparseXcsrsv2_bufferSizeExt x x -hipsparseXcsrsv2_analysis x x 
-hipsparseXcsrsv2_solve x x -hipsparseXhybmv x x +hipsparseXcsrmv x x +hipsparseXcsrsv2_bufferSize x x +hipsparseXcsrsv2_bufferSizeExt x x +hipsparseXcsrsv2_analysis x x +hipsparseXcsrsv2_solve x x +hipsparseXhybmv x x ================================ ========== ========= ================ ================= ====== **Level 3** -================================ ========== ========= ================ ================= ====== +================================ ========== ========= ================ ================= ====== Function single double single complex double complex half ================================ ========== ========= ================ ================= ====== -hipsparseXcsrmm x x -hipsparseXcsrmm2 x x +hipsparseXcsrmm x x +hipsparseXcsrmm2 x x ================================ ========== ========= ================ ================= ====== **Extra** -================================ ========== ========= ================ ================= ====== +================================ ========== ========= ================ ================= ====== Function single double single complex double complex halfy ================================ ========== ========= ================ ================= ====== -hipsparseXcsrgemmNnz -hipsparseXcsrgemm x x -hipsparseXcsrgemm2_bufferSizeExt +hipsparseXcsrgemmNnz +hipsparseXcsrgemm x x +hipsparseXcsrgemm2_bufferSizeExt hipsparseXcsrgemm2Nnz hipsparseXcsrgemm2 ================================ ========== ========= ================ ================= ====== **Preconditioners** -================================= ========== ========= ================ ================= ====== +================================= ========== ========= ================ ================= ====== Function single double single complex double complex half ================================= ========== ========= ================ ================= ====== -hipsparseXcsrilu02_bufferSize x x -hipsparseXcsrilu02_bufferSizeExt x x -hipsparseXcsrilu02_analysis x x -hipsparseXcsrilu02 x x +hipsparseXcsrilu02_bufferSize x x +hipsparseXcsrilu02_bufferSizeExt x x +hipsparseXcsrilu02_analysis x x +hipsparseXcsrilu02 x x ================================= ========== ========= ================ ================= ====== **Conversion** -==================================== ========== ========= ================ ================= ====== +==================================== ========== ========= ================ ================= ====== Function single double single complex double complex half ==================================== ========== ========= ================ ================= ====== -hipsparseXcsr2coo -hipsparseXcsr2csc x x -hipsparseXcsr2hyb x x -hipsparseXcoo2csr -hipsparseCreateIdentityPermutation -hipsparseXcsrsort_bufferSizeExt -hipsparseXcsrsort -hipsparseXcoosort_bufferSizeExt -hipsparseXcoosortByRow -hipsparseXcoosortByColumn +hipsparseXcsr2coo +hipsparseXcsr2csc x x +hipsparseXcsr2hyb x x +hipsparseXcoo2csr +hipsparseCreateIdentityPermutation +hipsparseXcsrsort_bufferSizeExt +hipsparseXcsrsort +hipsparseXcoosort_bufferSizeExt +hipsparseXcoosortByRow +hipsparseXcoosortByColumn ==================================== ========== ========= ================ ================= ====== Additional notes diff --git a/ROCm_Libraries/rocALUTION/Doxyfile b/ROCm_Libraries/rocALUTION/Doxyfile index d8cad4ba..adf10b2e 100644 --- a/ROCm_Libraries/rocALUTION/Doxyfile +++ b/ROCm_Libraries/rocALUTION/Doxyfile @@ -163,7 +163,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. 
# This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -172,7 +172,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -239,13 +239,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -292,7 +292,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -642,7 +642,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -684,7 +684,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -697,7 +697,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -767,7 +767,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -797,7 +797,7 @@ INPUT = ROCm_Libraries/rocALUTION/src/modules.dox \ ROCm_Libraries/rocALUTION/src/solvers/direct \ ROCm_Libraries/rocALUTION/src/solvers/multigrid \ ROCm_Libraries/rocALUTION/src/solvers/preconditioners - + # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -878,7 +878,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -894,7 +894,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -905,13 +905,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -931,7 +931,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -948,7 +948,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -957,7 +957,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -972,7 +972,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1084,7 +1084,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1110,7 +1110,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1155,7 +1155,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. 
If the tag is left blank doxygen will generate a standard @@ -1165,7 +1165,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1177,7 +1177,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1190,7 +1190,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1200,7 +1200,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1329,7 +1329,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1337,7 +1337,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1350,7 +1350,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1381,7 +1381,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1406,7 +1406,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1414,21 +1414,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. 
-QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1561,7 +1561,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1569,7 +1569,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1629,7 +1629,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1645,7 +1645,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1655,7 +1655,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1719,7 +1719,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1735,7 +1735,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1746,7 +1746,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1757,7 +1757,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. 
-LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1765,7 +1765,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1865,14 +1865,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1917,7 +1917,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -1936,7 +1936,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = YES +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -2030,7 +2030,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2071,7 +2071,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2079,7 +2079,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2089,7 +2089,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2098,7 +2098,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2127,13 +2127,13 @@ SKIP_FUNCTION_MACROS = YES # the path). 
If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2182,14 +2182,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2238,7 +2238,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2382,26 +2382,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2409,12 +2409,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. 
If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocALUTION/src/base/base_matrix.hpp b/ROCm_Libraries/rocALUTION/src/base/base_matrix.hpp index 8f3506fc..022a3195 100644 --- a/ROCm_Libraries/rocALUTION/src/base/base_matrix.hpp +++ b/ROCm_Libraries/rocALUTION/src/base/base_matrix.hpp @@ -394,7 +394,7 @@ class BaseMatrix BaseMatrix* prolong, BaseMatrix* restrict) const; - /// Ruge Stüben coarsening + /// Ruge Stuben coarsening virtual bool RugeStueben(ValueType eps, BaseMatrix* prolong, BaseMatrix* restrict) const; diff --git a/ROCm_Libraries/rocALUTION/src/base/host/CMakeLists.txt b/ROCm_Libraries/rocALUTION/src/base/host/CMakeLists.txt index 88780086..ba13da22 100644 --- a/ROCm_Libraries/rocALUTION/src/base/host/CMakeLists.txt +++ b/ROCm_Libraries/rocALUTION/src/base/host/CMakeLists.txt @@ -31,7 +31,7 @@ set(HOST_SOURCES base/host/host_matrix_hyb.cpp base/host/host_matrix_dense.cpp base/host/host_vector.cpp - base/host/host_conversion.cpp + base/host/host_conversion.cpp base/host/host_affinity.cpp base/host/host_io.cpp base/host/host_stencil_laplace2d.cpp diff --git a/ROCm_Libraries/rocALUTION/src/base/host/host_matrix_csr.cpp b/ROCm_Libraries/rocALUTION/src/base/host/host_matrix_csr.cpp index 2de7f288..692070be 100644 --- a/ROCm_Libraries/rocALUTION/src/base/host/host_matrix_csr.cpp +++ b/ROCm_Libraries/rocALUTION/src/base/host/host_matrix_csr.cpp @@ -4308,7 +4308,7 @@ bool HostMatrixCSR::RugeStueben(ValueType eps, set_to_zero_host(this->nrow_ + 1, S_row_offset); set_to_zero_host(this->nnz_, S_val); -// Determine strong influences in matrix (Ruge Stüben approach) +// Determine strong influences in matrix (Ruge Stuben approach) #ifdef _OPENMP #pragma omp parallel for schedule(dynamic, 1024) #endif diff --git a/ROCm_Libraries/rocALUTION/src/solvers/multigrid/ruge_stueben_amg.cpp b/ROCm_Libraries/rocALUTION/src/solvers/multigrid/ruge_stueben_amg.cpp index 973af6da..33064b58 100644 --- a/ROCm_Libraries/rocALUTION/src/solvers/multigrid/ruge_stueben_amg.cpp +++ b/ROCm_Libraries/rocALUTION/src/solvers/multigrid/ruge_stueben_amg.cpp @@ -56,7 +56,7 @@ void RugeStuebenAMG::Print(void) const { LOG_INFO("AMG solver"); LOG_INFO("AMG number of levels " << this->levels_); - LOG_INFO("AMG using Ruge-Stüben coarsening"); + LOG_INFO("AMG using Ruge-Stuben coarsening"); LOG_INFO("AMG coarsest operator size = " << this->op_level_[this->levels_ - 2]->GetM()); LOG_INFO("AMG coarsest level nnz = " << this->op_level_[this->levels_ - 2]->GetNnz()); LOG_INFO("AMG with smoother:"); @@ -70,7 +70,7 @@ void RugeStuebenAMG::PrintStart_(void) cons LOG_INFO("AMG solver starts"); LOG_INFO("AMG number of levels " << this->levels_); - LOG_INFO("AMG using Ruge-Stüben coarsening"); + LOG_INFO("AMG using Ruge-Stuben coarsening"); LOG_INFO("AMG coarsest operator size = " << this->op_level_[this->levels_ - 2]->GetM()); LOG_INFO("AMG coarsest level nnz = " << this->op_level_[this->levels_ - 2]->GetNnz()); LOG_INFO("AMG with smoother:"); diff --git a/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner.hpp b/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner.hpp index eec8a31f..eeb4e2d6 100644 --- a/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner.hpp +++ b/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner.hpp @@ -291,7 +291,7 @@ class IC : public Preconditioner * \details * The Variable Preconditioner can hold a selection of preconditioners. Thus, any type * of preconditioners can be combined. 
As example, the variable preconditioner can - * combine Jacobi, GS and ILU – then, the first iteration of the iterative solver will + * combine Jacobi, GS and ILU - then, the first iteration of the iterative solver will * apply Jacobi, the second iteration will apply GS and the third iteration will apply * ILU. After that, the solver will start again with Jacobi, GS, ILU. * diff --git a/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner_ai.hpp b/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner_ai.hpp index b4bda8f5..f924aae9 100644 --- a/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner_ai.hpp +++ b/ROCm_Libraries/rocALUTION/src/solvers/preconditioners/preconditioner_ai.hpp @@ -72,7 +72,7 @@ class AIChebyshev : public Preconditioner * \brief Factorized Approximate Inverse Preconditioner * \details * The Factorized Sparse Approximate Inverse preconditioner computes a direct - * approximation of \f$M^{-1}\f$ by minimizing the Frobenius norm \f$||I − GL||_{F}\f$, + * approximation of \f$M^{-1}\f$ by minimizing the Frobenius norm \f$||I - GL||_{F}\f$, * where \f$L\f$ denotes the exact lower triangular part of \f$A\f$ and \f$G:=M^{-1}\f$. * The FSAI preconditioner is initialized by \f$q\f$, based on the sparsity pattern of * \f$|A^{q}|\f$. However, it is also possible to supply external sparsity patterns in form @@ -134,7 +134,7 @@ class FSAI : public Preconditioner * The SParse Approximate Inverse algorithm is an explicitly computed preconditioner for * general sparse linear systems. In its current implementation, only the sparsity * pattern of the system matrix is supported. The SPAI computation is based on the - * minimization of the Frobenius norm \f$||AM − I||_{F}\f$. + * minimization of the Frobenius norm \f$||AM - I||_{F}\f$. * \cite grote * * \tparam OperatorType - can be LocalMatrix diff --git a/ROCm_Libraries/rocBLAS/Doxyfile b/ROCm_Libraries/rocBLAS/Doxyfile index 196cfa0e..1cd2a76e 100644 --- a/ROCm_Libraries/rocBLAS/Doxyfile +++ b/ROCm_Libraries/rocBLAS/Doxyfile @@ -164,7 +164,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -173,7 +173,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -240,13 +240,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -295,7 +295,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. 
-EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -649,7 +649,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -691,7 +691,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -704,7 +704,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -773,7 +773,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -786,7 +786,7 @@ WARN_LOGFILE = # Note: If this tag is empty the current directory is searched. INPUT = ROCm_Libraries/rocBLAS/src/ \ - ROCm_Libraries/rocBLAS/src/src/ + ROCm_Libraries/rocBLAS/src/src/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -867,7 +867,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -883,7 +883,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -894,13 +894,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -920,7 +920,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -937,7 +937,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. 
If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -946,7 +946,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -961,7 +961,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1073,7 +1073,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1099,7 +1099,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1144,7 +1144,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1154,7 +1154,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1166,7 +1166,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1179,7 +1179,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1189,7 +1189,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1318,7 +1318,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
-CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1326,7 +1326,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1339,7 +1339,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1370,7 +1370,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1395,7 +1395,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1403,21 +1403,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1550,7 +1550,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1558,7 +1558,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1618,7 +1618,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. 
With the @@ -1634,7 +1634,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1644,7 +1644,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1708,7 +1708,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1724,7 +1724,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1735,7 +1735,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1746,7 +1746,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1754,7 +1754,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1854,14 +1854,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1906,7 +1906,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -2018,7 +2018,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. 
-PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2059,7 +2059,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2067,7 +2067,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2077,7 +2077,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2086,7 +2086,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2115,13 +2115,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2170,14 +2170,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2226,7 +2226,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2370,26 +2370,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). 
# This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2397,12 +2397,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocBLAS/src/include/rocblas-functions.h b/ROCm_Libraries/rocBLAS/src/include/rocblas-functions.h index 954f6136..a4245df5 100644 --- a/ROCm_Libraries/rocBLAS/src/include/rocblas-functions.h +++ b/ROCm_Libraries/rocBLAS/src/include/rocblas-functions.h @@ -115,7 +115,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zdscal(rocblas_handle handle, /*! \brief BLAS Level 1 API \details - scal_batched scales each element of vector x_i with scalar alpha, for i = 1, … , batch_count. + scal_batched scales each element of vector x_i with scalar alpha, for i = 1, ... , batch_count. x_i := alpha * x_i @@ -182,7 +182,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zdscal_batched(rocblas_handle /*! \brief BLAS Level 1 API \details - scal_strided_batched scales each element of vector x_i with scalar alpha, for i = 1, … , batch_count. + scal_strided_batched scales each element of vector x_i with scalar alpha, for i = 1, ... , batch_count. x_i := alpha * x_i , @@ -262,7 +262,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zdscal_strided_batched(rocblas_handle /*! \brief BLAS Level 1 API \details - copy copies each element x[i] into y[i], for i = 1 , … , n + copy copies each element x[i] into y[i], for i = 1 , ... , n y := x, @@ -316,7 +316,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zcopy(rocblas_handle handle /*! \brief BLAS Level 1 API \details - copy_batched copies each element x_i[j] into y_i[j], for j = 1 , … , n; i = 1 , … , batch_count + copy_batched copies each element x_i[j] into y_i[j], for j = 1 , ... , n; i = 1 , ... , batch_count y_i := x_i, @@ -380,7 +380,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_zcopy_batched(rocblas_handle /*! \brief BLAS Level 1 API \details - copy_strided_batched copies each element x_i[j] into y_i[j], for j = 1 , … , n; i = 1 , … , batch_count + copy_strided_batched copies each element x_i[j] into y_i[j], for j = 1 , ... , n; i = 1 , ... , batch_count y_i := x_i, @@ -4561,7 +4561,7 @@ rocblas_zsyr(rocblas_handle handle, A[i] := A[i] + alpha*x[i]*x[i]**T where alpha is a scalar, x is an array of vectors, and A is an array of - n by n symmetric matrices, for i = 1 , … , batch_count + n by n symmetric matrices, for i = 1 , ... 
, batch_count @param[in] handle [rocblas_handle] @@ -4621,7 +4621,7 @@ ROCBLAS_EXPORT rocblas_status rocblas_dsyr_batched(rocblas_handle handle, A[i] := A[i] + alpha*x[i]*x[i]**T where alpha is a scalar, vectors, and A is an array of - n by n symmetric matrices, for i = 1 , … , batch_count + n by n symmetric matrices, for i = 1 , ... , batch_count @param[in] handle [rocblas_handle] diff --git a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_copy.cpp b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_copy.cpp index cc108c2d..cecedfba 100644 --- a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_copy.cpp +++ b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_copy.cpp @@ -38,7 +38,7 @@ constexpr char rocblas_copy_name[] = "rocblas_zcopy"; /*! \brief BLAS Level 1 API \details - copy copies the vector x[i] into the vector y[i], for i = 1 , … , n + copy copies the vector x[i] into the vector y[i], for i = 1 , ... , n y := x, diff --git a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_scal.cpp b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_scal.cpp index 7508e7ca..ad880070 100644 --- a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_scal.cpp +++ b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_scal.cpp @@ -39,7 +39,7 @@ constexpr char rocblas_scal_name[] = "rocblas_zscal"; /*! \brief BLAS Level 1 API \details - scal scal the vector x[i] with scalar alpha, for i = 1 , … , n + scal scal the vector x[i] with scalar alpha, for i = 1 , ... , n x := alpha * x , diff --git a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_swap.cpp b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_swap.cpp index 712b2e3d..8dc1f9c6 100644 --- a/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_swap.cpp +++ b/ROCm_Libraries/rocBLAS/src/src/blas1/rocblas_swap.cpp @@ -41,7 +41,7 @@ constexpr char rocblas_swap_name[] = "rocblas_zswap"; /*! \brief BLAS Level 1 API \details - swap interchange vector x[i] and y[i], for i = 1 , … , n + swap interchange vector x[i] and y[i], for i = 1 , ... 
, n y := x; x := y diff --git a/ROCm_Libraries/rocBLAS/src/src/blas_ex/rocblas_gemm_ex.hpp b/ROCm_Libraries/rocBLAS/src/src/blas_ex/rocblas_gemm_ex.hpp index 07b33167..11352bf8 100644 --- a/ROCm_Libraries/rocBLAS/src/src/blas_ex/rocblas_gemm_ex.hpp +++ b/ROCm_Libraries/rocBLAS/src/src/blas_ex/rocblas_gemm_ex.hpp @@ -296,13 +296,13 @@ rocblas_status gemm_ex_handle_transpose(rocblas_handle handle, if((trans_a == rocblas_operation_none) && (trans_b == rocblas_operation_none)) { - t_status = tensile_Cijk_Ailk_Bljk_B(static_cast(d), - static_cast(c_in), - static_cast(a), + t_status = tensile_Cijk_Ailk_Bljk_B(static_cast(d), + static_cast(c_in), + static_cast(a), static_cast(b), alpha, beta, - static_cast(ldd), stride_d, - static_cast(lda), stride_a, + static_cast(ldd), stride_d, + static_cast(lda), stride_a, static_cast(ldb), stride_b, static_cast(m), static_cast(n), @@ -313,13 +313,13 @@ rocblas_status gemm_ex_handle_transpose(rocblas_handle handle, else if((trans_a == rocblas_operation_none) && (trans_b == rocblas_operation_transpose || trans_b == rocblas_operation_conjugate_transpose)) { - t_status = tensile_Cijk_Ailk_Bjlk_B(static_cast(d), - static_cast(c_in), - static_cast(a), + t_status = tensile_Cijk_Ailk_Bjlk_B(static_cast(d), + static_cast(c_in), + static_cast(a), static_cast(b), alpha, beta, - static_cast(ldd), stride_d, - static_cast(lda), stride_a, + static_cast(ldd), stride_d, + static_cast(lda), stride_a, static_cast(ldb), stride_b, static_cast(m), static_cast(n), @@ -331,12 +331,12 @@ rocblas_status gemm_ex_handle_transpose(rocblas_handle handle, (trans_b == rocblas_operation_none)) { t_status = tensile_Cijk_Alik_Bljk_B(static_cast(d), - static_cast(c_in), + static_cast(c_in), static_cast(a), static_cast(b), alpha, beta, - static_cast(ldd), stride_d, - static_cast(lda), stride_a, + static_cast(ldd), stride_d, + static_cast(lda), stride_a, static_cast(ldb), stride_b, static_cast(m), static_cast(n), @@ -348,12 +348,12 @@ rocblas_status gemm_ex_handle_transpose(rocblas_handle handle, (trans_b == rocblas_operation_transpose || trans_b == rocblas_operation_conjugate_transpose)) { t_status = tensile_Cijk_Alik_Bjlk_B(static_cast(d), - static_cast(c_in), + static_cast(c_in), static_cast(a), static_cast(b), alpha, beta, - static_cast(ldd), stride_d, - static_cast(lda), stride_a, + static_cast(ldd), stride_d, + static_cast(lda), stride_a, static_cast(ldb), stride_b, static_cast(m), static_cast(n), diff --git a/ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp b/ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp index 93e87a70..9a80f7af 100644 --- a/ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp +++ b/ROCm_Libraries/rocBLAS/src/src/buildinfo.cpp @@ -1,41 +1,41 @@ -/* ************************************************************************ - * Copyright 2018 Advanced Micro Devices, Inc. - * - * ************************************************************************ */ - -#include -#include -#include -#include "definitions.h" -#include "rocblas-types.h" -#include "rocblas-functions.h" -#include "rocblas-version.h" - -#define TO_STR2(x) #x -#define TO_STR(x) TO_STR2(x) -// clang-format off -#define VERSION_STRING \ - (TO_STR(ROCBLAS_VERSION_MAJOR) "." \ - TO_STR(ROCBLAS_VERSION_MINOR) "." \ - TO_STR(ROCBLAS_VERSION_PATCH) "." \ - TO_STR(ROCBLAS_VERSION_TWEAK) "-" \ - TO_STR(ROCBLAS_VERSION_COMMIT_ID)) -// clang-format on -/******************************************************************************* - *! \brief loads char* buf with the rocblas library version. 
size_t len - is the maximum length of char* buf. - ******************************************************************************/ -extern "C" rocblas_status rocblas_get_version_string(char* buf, size_t len) -{ - std::string v(VERSION_STRING); - strcpy(buf, v.c_str()); - - if(buf == NULL) - return rocblas_status_internal_error; - - size_t count = std::min(len - 1, v.length()); - memcpy(buf, v.c_str(), count); - *(buf + count) = '\0'; - - return rocblas_status_success; -} +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +#include +#include +#include +#include "definitions.h" +#include "rocblas-types.h" +#include "rocblas-functions.h" +#include "rocblas-version.h" + +#define TO_STR2(x) #x +#define TO_STR(x) TO_STR2(x) +// clang-format off +#define VERSION_STRING \ + (TO_STR(ROCBLAS_VERSION_MAJOR) "." \ + TO_STR(ROCBLAS_VERSION_MINOR) "." \ + TO_STR(ROCBLAS_VERSION_PATCH) "." \ + TO_STR(ROCBLAS_VERSION_TWEAK) "-" \ + TO_STR(ROCBLAS_VERSION_COMMIT_ID)) +// clang-format on +/******************************************************************************* + *! \brief loads char* buf with the rocblas library version. size_t len + is the maximum length of char* buf. + ******************************************************************************/ +extern "C" rocblas_status rocblas_get_version_string(char* buf, size_t len) +{ + std::string v(VERSION_STRING); + strcpy(buf, v.c_str()); + + if(buf == NULL) + return rocblas_status_internal_error; + + size_t count = std::min(len - 1, v.length()); + memcpy(buf, v.c_str(), count); + *(buf + count) = '\0'; + + return rocblas_status_success; +} diff --git a/ROCm_Libraries/rocFFT/Doxyfile b/ROCm_Libraries/rocFFT/Doxyfile index d7cd8a71..ab5ce1b3 100644 --- a/ROCm_Libraries/rocFFT/Doxyfile +++ b/ROCm_Libraries/rocFFT/Doxyfile @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -238,13 +238,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. 
-EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -696,7 +696,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -858,7 +858,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -885,13 +885,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -928,7 +928,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -937,7 +937,7 @@ INPUT_FILTER = # filters are used. 
If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -952,7 +952,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1136,7 +1136,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1146,7 +1146,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1158,7 +1158,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1171,7 +1171,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1181,7 +1181,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1310,7 +1310,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1318,7 +1318,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
-HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1331,7 +1331,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1362,7 +1362,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1387,7 +1387,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1395,21 +1395,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1542,7 +1542,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1550,7 +1550,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1610,7 +1610,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1626,7 +1626,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. 
-EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1636,7 +1636,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1700,7 +1700,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1716,7 +1716,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1727,7 +1727,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1738,7 +1738,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1746,7 +1746,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1846,14 +1846,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1898,7 +1898,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -1917,7 +1917,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = YES +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. 
If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -2011,7 +2011,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2052,7 +2052,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2060,7 +2060,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2070,7 +2070,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2079,7 +2079,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2108,13 +2108,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2163,14 +2163,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2219,7 +2219,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2363,26 +2363,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. 
# This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2390,12 +2390,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocSOLVER/API.rst b/ROCm_Libraries/rocSOLVER/API.rst index bdfb6ff3..bf80aac6 100644 --- a/ROCm_Libraries/rocSOLVER/API.rst +++ b/ROCm_Libraries/rocSOLVER/API.rst @@ -1,12 +1,12 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* rocSOLVER API ************* -This section provides details of the rocSOLVER library API as in release +This section provides details of the rocSOLVER library API as in release `ROCm 2.10 `_. @@ -14,7 +14,7 @@ This section provides details of the rocSOLVER library API as in release Types ===== -Most rocSOLVER types are aliases of rocBLAS types. +Most rocSOLVER types are aliases of rocBLAS types. See rocBLAS types `here `_. Definitions @@ -312,7 +312,7 @@ rocsolver_getrs_strided_batched() Auxiliaries ========================= -rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions +rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions `here `_. rocSOLVER handle auxiliaries diff --git a/ROCm_Libraries/rocSOLVER/Doxyfile b/ROCm_Libraries/rocSOLVER/Doxyfile index de295523..45b8d873 100644 --- a/ROCm_Libraries/rocSOLVER/Doxyfile +++ b/ROCm_Libraries/rocSOLVER/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = ./rocmlogo.png # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -#OUTPUT_DIRECTORY = +#OUTPUT_DIRECTORY = # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. 
-STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -238,13 +238,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -696,7 +696,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -706,7 +706,7 @@ LAYOUT_FILE = # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. -CITE_BIB_FILES = +CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages @@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -858,7 +858,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -874,7 +874,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) 
that should be excluded from the @@ -885,13 +885,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -911,7 +911,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -928,7 +928,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -937,7 +937,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -952,7 +952,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1064,7 +1064,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1090,7 +1090,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1134,7 +1134,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1144,7 +1144,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1156,7 +1156,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. 
-HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1169,7 +1169,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1179,7 +1179,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1308,7 +1308,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1316,7 +1316,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1329,7 +1329,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1360,7 +1360,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1385,7 +1385,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1393,21 +1393,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. 
To @@ -1549,7 +1549,7 @@ MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1609,7 +1609,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1625,7 +1625,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1635,7 +1635,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1699,7 +1699,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1715,7 +1715,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1726,7 +1726,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1737,7 +1737,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1745,7 +1745,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1845,14 +1845,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. 
-RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1897,7 +1897,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -2010,7 +2010,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2051,7 +2051,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2059,7 +2059,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2069,7 +2069,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2078,7 +2078,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2107,13 +2107,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2162,14 +2162,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. 
@@ -2218,7 +2218,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2362,26 +2362,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2389,12 +2389,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocSOLVER/Introduction.rst b/ROCm_Libraries/rocSOLVER/Introduction.rst index 5d75fcda..a98d401a 100644 --- a/ROCm_Libraries/rocSOLVER/Introduction.rst +++ b/ROCm_Libraries/rocSOLVER/Introduction.rst @@ -1,14 +1,14 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* Introduction ************* -rocSOLVER is a library of Lapack routines on top of AMD’s Radeon Open Compute Platform (ROCm) runtime and toolchains. -rocSOLVER is implemented in the HIP programming language and based on an optimized BLAS -implementation for AMD’s latest discrete GPUs. +rocSOLVER is a library of Lapack routines on top of AMD's Radeon Open Compute Platform (ROCm) runtime and toolchains. +rocSOLVER is implemented in the HIP programming language and based on an optimized BLAS +implementation for AMD's latest discrete GPUs. For more information about rocBLAS, see `rocBLAS `_. @@ -19,9 +19,9 @@ Build and Install Prerequisites -------------- -For installation, rocSOLVER requires `cmake `_ -and `ROCm `_, including -`hip `_ and +For installation, rocSOLVER requires `cmake `_ +and `ROCm `_, including +`hip `_ and `rocBLAS `_ @@ -31,7 +31,7 @@ Installation Follow the instructions below to build and install rocSOLVER: .. code-block:: bash - + mkdir build && cd build CXX=/opt/rocm/bin/hcc cmake .. 
make @@ -48,48 +48,48 @@ The following table summarizes the LAPACK functionality implemented in rocSOLVER =============================== ====== ====== ============== ============== Lapack Auxiliary Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_laswp** x x x x -**rocsolver_larfg** x x +**rocsolver_laswp** x x x x +**rocsolver_larfg** x x **rocsolver_larft** x x **rocsolver_larf** x x -**rocsolver_larfb** x x -**rocsolver_org2r** x x -**rocsolver_orgqr** x x -**rocsolver_orgl2** x x -**rocsolver_orglq** x x -**rocsolver_orgbr** x x -**rocsolver_orm2r** x x -**rocsolver_ormqr** x x +**rocsolver_larfb** x x +**rocsolver_org2r** x x +**rocsolver_orgqr** x x +**rocsolver_orgl2** x x +**rocsolver_orglq** x x +**rocsolver_orgbr** x x +**rocsolver_orm2r** x x +**rocsolver_ormqr** x x =============================== ====== ====== ============== ============== =============================== ====== ====== ============== ============== Lapack Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_potf2** x x -rocsolver_potf2_batched x x -rocsolver_potf2_strided_batched x x -**rocsolver_potrf** x x -rocsolver_potrf_batched x x -rocsolver_potrf_strided_batched x x +**rocsolver_potf2** x x +rocsolver_potf2_batched x x +rocsolver_potf2_strided_batched x x +**rocsolver_potrf** x x +rocsolver_potrf_batched x x +rocsolver_potrf_strided_batched x x **rocsolver_getf2** x x x x rocsolver_getf2_batched x x x x rocsolver_getf2_strided_batched x x x x -**rocsolver_getrf** x x x x +**rocsolver_getrf** x x x x rocsolver_getrf_batched x x x x rocsolver_getrf_strided_batched x x x x -**rocsolver_geqr2** x x +**rocsolver_geqr2** x x rocsolver_geqr2_batched x x rocsolver_geqr2_strided_batched x x -**rocsolver_geqrf** x x -rocsolver_geqrf_batched x x +**rocsolver_geqrf** x x +rocsolver_geqrf_batched x x rocsolver_geqrf_strided_batched x x -**rocsolver_gelq2** x x +**rocsolver_gelq2** x x rocsolver_gelq2_batched x x rocsolver_gelq2_strided_batched x x -**rocsolver_gelqf** x x -rocsolver_gelqf_batched x x +**rocsolver_gelqf** x x +rocsolver_gelqf_batched x x rocsolver_gelqf_strided_batched x x -**rocsolver_getrs** x x x x +**rocsolver_getrs** x x x x rocsolver_getrs_batched x x x x rocsolver_getrs_strided_batched x x x x =============================== ====== ====== ============== ============== @@ -97,30 +97,30 @@ rocsolver_getrs_strided_batched x x x x Benchmarking and Testing ========================== -For testing and benchmarking, rocSOLVER has a basic/preliminary infrastructure similar to rocBLAS. +For testing and benchmarking, rocSOLVER has a basic/preliminary infrastructure similar to rocBLAS. -On a normal installation, clients are located in the directory **/build/clients/staging**. +On a normal installation, clients are located in the directory **/build/clients/staging**. **rocsolver-test** executes a suite of `Google tests `_ (*gtest*) that verifies the correct -functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by +functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by `NETLib LAPACK `_ on the CPU. Calling the rocSOLVER gtest client with the --help flag .. code-block:: bash - + ./rocsolver-test --help -returns information on different flags that control the behavior of the gtests. 
+returns information on different flags that control the behavior of the gtests. **rocsolver-bench** allows to run any rocSOLVER function with random data of the specified dimensions; it compares the computed results, and provides basic -performance information (as for now, execution times). +performance information (as for now, execution times). -Similarly, +Similarly, .. code-block:: bash - + ./rocsolver-bench --help -returns information on how to use the rocSOLVER benchmark client. - +returns information on how to use the rocSOLVER benchmark client. + diff --git a/ROCm_Libraries/rocSOLVER/Jenkinsfile b/ROCm_Libraries/rocSOLVER/Jenkinsfile index e8d0d1de..7c9b42b0 100644 --- a/ROCm_Libraries/rocSOLVER/Jenkinsfile +++ b/ROCm_Libraries/rocSOLVER/Jenkinsfile @@ -26,8 +26,8 @@ rocSOLVERCI: { def rocsolver = new rocProject('rocSOLVER') - - def nodes = new dockerNodes(['internal && gfx900 && ubuntu16', 'internal && gfx906 && ubuntu16', 'internal && gfx906 && centos7', + + def nodes = new dockerNodes(['internal && gfx900 && ubuntu16', 'internal && gfx906 && ubuntu16', 'internal && gfx906 && centos7', 'internal && gfx900 && centos7','internal && gfx900 && ubuntu16 && hip-clang', 'internal && gfx906 && ubuntu16 && hip-clang', 'internal && gfx900 && sles', 'internal && gfx906 && sles'], rocsolver) @@ -43,7 +43,7 @@ rocSOLVERCI: String compiler = platform.jenkinsLabel.contains('hip-clang') ? 'hipcc' : 'hcc' String branch = platform.jenkinsLabel.contains('hip-clang') ? 'hip-clang' : 'develop' String build_command = "${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/${compiler} -Damd_comgr_DIR=/opt/rocm/lib/cmake/amd_comgr .." - + def getRocBLAS = auxiliary.getLibrary('rocBLAS',platform.jenkinsLabel,branch) def command = """#!/usr/bin/env bash set -x @@ -81,7 +81,7 @@ rocSOLVERCI: finally { junit "${project.paths.project_build_prefix}/build/clients/staging/*.xml" - } + } } def packageCommand = @@ -90,7 +90,7 @@ rocSOLVERCI: String branch = platform.jenkinsLabel.contains('hip-clang') ? 'hip-clang' : 'develop' def getRocBLAS = auxiliary.getLibrary('rocBLAS',platform.jenkinsLabel,branch) - def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build",false,getRocBLAS) + def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build",false,getRocBLAS) platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) diff --git a/ROCm_Libraries/rocSOLVER/LICENSE.md b/ROCm_Libraries/rocSOLVER/LICENSE.md index 6f3eab60..22991b38 100644 --- a/ROCm_Libraries/rocSOLVER/LICENSE.md +++ b/ROCm_Libraries/rocSOLVER/LICENSE.md @@ -1,4 +1,4 @@ -Copyright © 2018 Advanced Micro Devices, Inc. +Copyright (C) 2018 Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/ROCm_Libraries/rocSOLVER/bump_develop_version.sh b/ROCm_Libraries/rocSOLVER/bump_develop_version.sh index 6d6f9f44..f30aa8b5 100644 --- a/ROCm_Libraries/rocSOLVER/bump_develop_version.sh +++ b/ROCm_Libraries/rocSOLVER/bump_develop_version.sh @@ -2,8 +2,8 @@ # This script needs to be edited to bump new master version to new develop for new release. 
# - run this script after running bump_master_version.sh and merging develop into master -# - run this script in master branch -# - after running this script merge master into develop +# - run this script in master branch +# - after running this script merge master into develop OLD_ROCSOLVER_VERSION="0.0.1" NEW_ROCSOLVER_VERSION="0.0.2" diff --git a/ROCm_Libraries/rocSOLVER/bump_master_version.sh b/ROCm_Libraries/rocSOLVER/bump_master_version.sh index d6da7160..9c42e6b2 100644 --- a/ROCm_Libraries/rocSOLVER/bump_master_version.sh +++ b/ROCm_Libraries/rocSOLVER/bump_master_version.sh @@ -1,7 +1,7 @@ #!/bin/sh # This script needs to be edited to bump old develop version to new master version for new release. -# - run this script in develop branch +# - run this script in develop branch # - after running this script merge develop into master # - after running this script and merging develop into master, run bump_develop_version.sh in master and # merge master into develop diff --git a/ROCm_Libraries/rocSOLVER/cmake/get-cli-arguments.cmake b/ROCm_Libraries/rocSOLVER/cmake/get-cli-arguments.cmake index 110bcfaa..5483d094 100644 --- a/ROCm_Libraries/rocSOLVER/cmake/get-cli-arguments.cmake +++ b/ROCm_Libraries/rocSOLVER/cmake/get-cli-arguments.cmake @@ -22,4 +22,4 @@ function( append_cmake_cli_arguments initial_cli_args return_cli_args ) # message( STATUS "get_command_line_arguments: ${cli_args}") set( ${return_cli_args} ${${initial_cli_args}} ${cli_args} PARENT_SCOPE ) -endfunction( ) \ No newline at end of file +endfunction( ) diff --git a/ROCm_Libraries/rocSOLVER/debian/postinst b/ROCm_Libraries/rocSOLVER/debian/postinst index 8675688f..36acd581 100644 --- a/ROCm_Libraries/rocSOLVER/debian/postinst +++ b/ROCm_Libraries/rocSOLVER/debian/postinst @@ -1,4 +1,3 @@ echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocsolver-alt.conf ldconfig - \ No newline at end of file diff --git a/ROCm_Libraries/rocSOLVER/debian/prerm b/ROCm_Libraries/rocSOLVER/debian/prerm index 0d084f2c..748f5a80 100644 --- a/ROCm_Libraries/rocSOLVER/debian/prerm +++ b/ROCm_Libraries/rocSOLVER/debian/prerm @@ -1,4 +1,3 @@ rm /etc/ld.so.conf.d/rocsolver-alt.conf ldconfig - \ No newline at end of file diff --git a/ROCm_Libraries/rocSOLVER/deps/external-lapack.cmake b/ROCm_Libraries/rocSOLVER/deps/external-lapack.cmake index 7355eb98..6dc43477 100644 --- a/ROCm_Libraries/rocSOLVER/deps/external-lapack.cmake +++ b/ROCm_Libraries/rocSOLVER/deps/external-lapack.cmake @@ -39,7 +39,7 @@ ExternalProject_Add( ) # The fortran flag '-fno-optimize-sibling-calls' has been added as a workaround for a known bug # that causes incompatibility issues between gfortran and C lapack calls for gfortran versions 7,8 and 9 -# The ticket can be tracked at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90329 +# The ticket can be tracked at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90329 ExternalProject_Get_Property( lapack source_dir ) diff --git a/ROCm_Libraries/rocSOLVER/docs/Doxyfile b/ROCm_Libraries/rocSOLVER/docs/Doxyfile index c41190c8..d9539384 100644 --- a/ROCm_Libraries/rocSOLVER/docs/Doxyfile +++ b/ROCm_Libraries/rocSOLVER/docs/Doxyfile @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. 
-STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -238,13 +238,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -639,7 +639,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -681,7 +681,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -694,7 +694,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -704,7 +704,7 @@ LAYOUT_FILE = # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. -CITE_BIB_FILES = +CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages @@ -763,7 +763,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -856,7 +856,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -872,7 +872,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -883,13 +883,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -909,7 +909,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -926,7 +926,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -935,7 +935,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -950,7 +950,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1062,7 +1062,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1088,7 +1088,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1132,7 +1132,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. 
If the tag is left blank doxygen will generate a standard @@ -1142,7 +1142,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1154,7 +1154,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1167,7 +1167,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1177,7 +1177,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1306,7 +1306,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1314,7 +1314,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1327,7 +1327,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1358,7 +1358,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1383,7 +1383,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1391,21 +1391,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. 
-QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1538,7 +1538,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1546,7 +1546,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1606,7 +1606,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1622,7 +1622,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1632,7 +1632,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1696,7 +1696,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1712,7 +1712,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1723,7 +1723,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1734,7 +1734,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. 
-LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1742,7 +1742,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1842,14 +1842,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1894,7 +1894,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -2007,7 +2007,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2048,7 +2048,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2056,7 +2056,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2066,7 +2066,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2075,7 +2075,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2104,13 +2104,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2159,14 +2159,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2215,7 +2215,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2359,26 +2359,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2386,12 +2386,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-functions.h b/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-functions.h index cd388512..3fbbfaf4 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-functions.h +++ b/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-functions.h @@ -42,7 +42,7 @@ extern "C" { n rocsolver_int. n >= 0.\n The number of columns of the matrix A. @param[inout] - A pointer to type. Array on the GPU of dimension lda*n. \n + A pointer to type. Array on the GPU of dimension lda*n. \n On entry, the matrix of column dimension n to which the row interchanges will be applied. On exit, the permuted matrix. 
@param[in] @@ -59,7 +59,7 @@ extern "C" { @param[in] ipiv pointer to rocsolver_int. Array on the GPU of dimension at least k1 + (k2 - k1) * abs(incx).\n The vector of pivot indices. Only the elements in positions - k1 through (k1 + (k2 - k1) * abs(incx)) of IPIV are accessed. + k1 through (k1 + (k2 - k1) * abs(incx)) of IPIV are accessed. Elements of ipiv are considered 1-based. @param[in] incx rocsolver_int. incx != 0.\n @@ -67,92 +67,92 @@ extern "C" { is negative, the pivots are applied in reverse order. *************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slaswp(rocsolver_handle handle, const rocsolver_int n, - float *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + float *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlaswp(rocsolver_handle handle, const rocsolver_int n, - double *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + double *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_claswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + rocblas_float_complex *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_zlaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + rocblas_double_complex *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -/*! \brief LARFG generates an orthogonal Householder reflector H of order n. +/*! \brief LARFG generates an orthogonal Householder reflector H of order n. \details Householder reflector H is such that - + H * [alpha] = [beta] [ x ] [ 0 ] - where x is an n-1 vector and alpha and beta are scalars. Matrix H can be + where x is an n-1 vector and alpha and beta are scalars. Matrix H can be generated as - + H = I - tau * [1] * [1 v'] [v] - with v an n-1 vector and tau a scalar. + with v an n-1 vector and tau a scalar. @param[in] handle rocsolver_handle @param[in] n rocsolver_int. n >= 0.\n - The order (size) of reflector H. + The order (size) of reflector H. @param[inout] alpha pointer to type. A scalar on the GPU.\n - On input the scalar alpha, + On input the scalar alpha, on output it is overwritten with beta. - @param[inout] + @param[inout] x pointer to type. Array on the GPU of size at least n-1.\n - On input it is the vector x, + On input it is the vector x, on output it is overwritten with vector v. @param[in] incx rocsolver_int. incx > 0.\n - The increment between consecutive elements of x. 
+ The increment between consecutive elements of x. @param[out] tau pointer to type. A scalar on the GPU.\n The scalar tau. *************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfg(rocsolver_handle handle, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfg(rocsolver_handle handle, + const rocsolver_int n, float *alpha, - float *x, - const rocsolver_int incx, + float *x, + const rocsolver_int incx, float *tau); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, + const rocsolver_int n, double *alpha, - double *x, - const rocsolver_int incx, + double *x, + const rocsolver_int incx, double *tau); @@ -164,9 +164,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, H = H(1) * H(2) * ... * H(k) (forward direction), or H = H(k) * ... * H(2) * H(1) (backward direction) - depending on the value of direct. + depending on the value of direct. - The triangular matrix T is upper triangular in forward direction and lower triangular in backward direction. + The triangular matrix T is upper triangular in forward direction and lower triangular in backward direction. If storev is column-wise, then H = I - V * T * V' @@ -175,7 +175,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, H = I - V' * T * V - where the i-th row of matrix V contains the Householder vector associated to H(i). + where the i-th row of matrix V contains the Householder vector associated to H(i). @param[in] handle rocsolver_handle. @@ -188,10 +188,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n The order (size) of the block reflector. - @param[in] + @param[in] k rocsovler_int. k >= 1.\n The number of Householder matrices. - @param[in] + @param[in] V pointer to type. Array on the GPU of size ldv*k if column-wise, or ldv*n if row-wise.\n The matrix of Householder vectors. @param[in] @@ -203,44 +203,44 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, @param[out] T pointer to type. Array on the GPU of dimension ldt*k.\n The triangular factor. T is upper triangular is forward operation, otherwise it is lower triangular. - The rest of the array is not used. - @param[in] + The rest of the array is not used. + @param[in] ldt rocsolver_int. ldt >= k.\n The leading dimension of T. - **************************************************************************/ + **************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_slarft(rocsolver_handle handle, - const rocsolver_direct direct, + const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, float *V, const rocsolver_int ldv, float *tau, - float *T, - const rocsolver_int ldt); + float *T, + const rocsolver_int ldt); ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, - const rocsolver_int n, + const rocsolver_storev storev, + const rocsolver_int n, const rocsolver_int k, double *V, const rocsolver_int ldv, double *tau, - double *T, - const rocsolver_int ldt); + double *T, + const rocsolver_int ldt); /*! \brief LARF applies a Householder reflector H to a general matrix A. 
\details The Householder reflector H, of order m (or n), is to be applied to a m-by-n matrix A - from the left (or the right). H is given by + from the left (or the right). H is given by H = I - alpha * x * x' - + where alpha is a scalar and x a Householder vector. H is never actually computed. @param[in] @@ -254,16 +254,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, Number of rows of A. @param[in] n rocsolver_int. n >= 0.\n - Number of columns of A. + Number of columns of A. @param[in] - x pointer to type. Array on the GPU of + x pointer to type. Array on the GPU of size at least (1 + (m-1)*abs(incx)) if left side, or at least (1 + (n-1)*abs(incx)) if right side.\n The Householder vector x. @param[in] incx rocsolver_int. incx != 0.\n - Increment between to consecutive elements of x. - If incx < 0, the elements of x are used in reverse order. + Increment between to consecutive elements of x. + If incx < 0, the elements of x are used in reverse order. @param[in] alpha pointer to type. A scalar on the GPU.\n If alpha = 0, then H = I (A will remain the same, x is never used) @@ -273,35 +273,35 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, H*A (or A*H). @param[in] lda rocsolver_int. lda >= m.\n - Leading dimension of A. - + Leading dimension of A. + *************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slarf(rocsolver_handle handle, - const rocsolver_side side, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slarf(rocsolver_handle handle, + const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, - float* x, - const rocsolver_int incx, + const rocsolver_int n, + float* x, + const rocsolver_int incx, const float* alpha, - float* A, + float* A, const rocsolver_int lda); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, - const rocsolver_side side, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, + const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, - double* x, - const rocsolver_int incx, + const rocsolver_int n, + double* x, + const rocsolver_int incx, const double* alpha, - double* A, + double* A, const rocsolver_int lda); /*! \brief LARFB applies a block reflector H to a general m-by-n matrix A. \details - The block reflector H is applied in one of the following forms, depending on + The block reflector H is applied in one of the following forms, depending on the values of side and trans: H * A (No transpose from the left) @@ -322,7 +322,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, H = I - V' * T * V - where the i-th row of matrix V contains the Householder vector associated to H(i), if storev is row-wise. + where the i-th row of matrix V contains the Householder vector associated to H(i), if storev is row-wise. T is the associated triangular factor as computed by LARFT. @param[in] @@ -345,11 +345,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix A. - @param[in] + @param[in] k rocsovler_int. k >= 1.\n The number of Householder matrices. - @param[in] - V pointer to type. Array on the GPU of size ldv*k if column-wise, ldv*n if row-wise and applying from the right, + @param[in] + V pointer to type. 
Array on the GPU of size ldv*k if column-wise, ldv*n if row-wise and applying from the right, or ldv*m if row-wise and applying from the left.\n The matrix of Householder vectors. @param[in] @@ -359,16 +359,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, @param[in] T pointer to type. Array on the GPU of dimension ldt*k.\n The triangular factor of the block reflector. - @param[in] + @param[in] ldt rocsolver_int. ldt >= k.\n The leading dimension of T. @param[inout] A pointer to type. Array on the GPU of size lda*n.\n On input, the matrix A. On output it is overwritten with - H*A, A*H, H'*A, or A*H'. + H*A, A*H, H'*A, or A*H'. @param[in] lda rocsolver_int. lda >= m.\n - Leading dimension of A. + Leading dimension of A. ****************************************************************************/ @@ -376,31 +376,31 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfb(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_direct direct, - const rocsolver_storev storev, + const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, float *V, const rocsolver_int ldv, - float *T, + float *T, const rocsolver_int ldt, float *A, - const rocsolver_int lda); + const rocsolver_int lda); ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_direct direct, - const rocsolver_storev storev, + const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, double *V, const rocsolver_int ldv, - double *T, + double *T, const rocsolver_int ldt, double *A, - const rocsolver_int lda); + const rocsolver_int lda); /*! \brief ORG2R generates a m-by-n Matrix Q with orthonormal columns. @@ -409,17 +409,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, The matrix Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEQRF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. 0 <= n <= m.\n The number of colums of the matrix Q. @@ -433,7 +433,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GEQRF. 
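A minimal host-side sketch of driving the LARFG routine documented in the hunks above, included for orientation only. It assumes the handle is created and destroyed with rocsolver_create_handle / rocsolver_destroy_handle (auxiliary calls not shown in this header), that the public header is <rocsolver.h>, and it uses the HIP runtime for device memory; adjust those assumptions to the release in use.

    #include <hip/hip_runtime.h>
    #include <rocsolver.h>                       /* assumed header name for this release */

    int main(void) {
        /* Reflector of order n = 4: a scalar alpha plus an (n-1)-vector x. */
        double h_alpha = 3.0;
        double h_x[3]  = { 1.0, 2.0, 2.0 };

        rocsolver_handle handle;
        rocsolver_create_handle(&handle);        /* assumed auxiliary API */

        double *d_alpha, *d_x, *d_tau;
        hipMalloc((void**)&d_alpha, sizeof(double));
        hipMalloc((void**)&d_x, 3 * sizeof(double));
        hipMalloc((void**)&d_tau, sizeof(double));
        hipMemcpy(d_alpha, &h_alpha, sizeof(double), hipMemcpyHostToDevice);
        hipMemcpy(d_x, h_x, 3 * sizeof(double), hipMemcpyHostToDevice);

        /* On exit d_alpha holds beta, d_x holds the Householder vector v,
           and d_tau holds tau, as described for LARFG above. */
        rocsolver_dlarfg(handle, 4, d_alpha, d_x, 1, d_tau);

        hipFree(d_alpha); hipFree(d_x); hipFree(d_tau);
        rocsolver_destroy_handle(handle);        /* assumed auxiliary API */
        return 0;
    }

All arguments to rocsolver_dlarfg live on the GPU, matching the parameter descriptions above; only the setup and teardown happen on the host.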
@@ -442,16 +442,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorg2r(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -463,17 +463,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, The matrix Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEQRF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. 0 <= n <= m.\n The number of colums of the matrix Q. @@ -487,7 +487,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GEQRF. @@ -496,16 +496,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgqr(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -517,17 +517,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, The matrix Q is defined as the first m rows of the product of k Householder reflectors of order n - + Q = H(k) * H(k-1) * ... * H(1) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GELQF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. 0 <= m <= n.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. n >= 0.\n The number of colums of the matrix Q. @@ -541,7 +541,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GELQF. 
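The ORG2R/ORGQR descriptions above refer to the GEQRF family; as a rough illustration of that pattern, the sketch below factors a small column-major matrix with the unblocked GEQR2 routine documented later in this header and then overwrites it with the explicit Q. The same assumptions as the previous sketch apply (handle helpers, header name, LAPACK-style column-major storage with lda = m).

    #include <hip/hip_runtime.h>
    #include <rocsolver.h>                       /* assumed header name */

    int main(void) {
        rocsolver_handle handle;
        rocsolver_create_handle(&handle);        /* assumed auxiliary API */

        /* A is 3x3, column-major, lda = m = 3. */
        const rocsolver_int m = 3, n = 3, lda = 3;
        double hA[9] = { 12, 6, -4,   -51, 167, 24,   4, -68, -41 };

        double *dA, *dIpiv;                      /* dIpiv: min(m,n) Householder scalars */
        hipMalloc((void**)&dA, sizeof(double) * lda * n);
        hipMalloc((void**)&dIpiv, sizeof(double) * 3);
        hipMemcpy(dA, hA, sizeof(double) * lda * n, hipMemcpyHostToDevice);

        rocsolver_dgeqr2(handle, m, n, dA, lda, dIpiv);     /* R above the diagonal, v(i) below */
        rocsolver_dorgqr(handle, m, n, n, dA, lda, dIpiv);  /* overwrite A with the explicit Q, k = min(m,n) */

        hipMemcpy(hA, dA, sizeof(double) * lda * n, hipMemcpyDeviceToHost);
        hipFree(dA); hipFree(dIpiv);
        rocsolver_destroy_handle(handle);        /* assumed auxiliary API */
        return 0;
    }

The two calls reuse the same device array: GEQR2 leaves the Householder factors in A and their scalars in dIpiv, and ORGQR consumes exactly that layout, as the parameter descriptions above state.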
@@ -550,16 +550,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgl2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -572,17 +572,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, The matrix Q is defined as the first m rows of the product of k Householder reflectors of order n - + Q = H(k) * H(k-1) * ... * H(1) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GELQF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. 0 <= m <= n.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. n >= 0.\n The number of colums of the matrix Q. @@ -596,7 +596,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GELQF. @@ -605,16 +605,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorglq(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -622,9 +622,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, /*! \brief ORGBR generates a m-by-n Matrix Q with orthonormal rows or columns. \details - If storev is column-wise, then the matrix Q has orthonormal columns. If m >= k, Q is defined as the first + If storev is column-wise, then the matrix Q has orthonormal columns. If m >= k, Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) If m < k, Q is defined as the product of Householder reflectors of order m @@ -635,12 +635,12 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, first m rows of the product of k Householder reflectors of order n Q = H(k) * H(k-1) * ... * H(1) - + If n <= k, Q is defined as the product of Householder reflectors of order n Q = H(n-1) * H(n-2) * ... * H(1) - The Householder matrices H(i) are never stored, they are computed from its corresponding + The Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEBRD. 
@param[in] @@ -650,12 +650,12 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, Specifies whether to work column-wise or row-wise. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. If row-wise, then min(n,k) <= m <= n. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix Q. - If column-wise, then min(m,k) <= n <= m. + The number of colums of the matrix Q. + If column-wise, then min(m,k) <= n <= m. @param[in] k rocsolver_int. k >= 0.\n The number of columns (if storev is colum-wise) or rows (if row-wise) of the @@ -667,7 +667,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension min(m,k) if column-wise, or min(n,k) if row-wise.\n The scalar factors of the Householder matrices H(i) as returned by GEBRD. @@ -677,8 +677,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgbr(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); @@ -686,8 +686,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgbr(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -696,8 +696,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, \details (This is the unblocked version of the algorithm). - - The matrix Q is applied in one of the following forms, depending on + + The matrix Q is applied in one of the following forms, depending on the values of side and trans: Q * C (No transpose from the left) @@ -709,7 +709,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, Q = H(1) * H(2) * ... * H(k) - or order m if applying from the left, or n if applying from the right. Q is never stored, it is + or order m if applying from the left, or n if applying from the right. Q is never stored, it is calculated from the Householder vectors and scalars returned by the QR factorization GEQRF. @param[in] @@ -726,10 +726,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix C. - @param[in] + @param[in] k rocsovler_int. k >= 0; k <= m if side is left, k <= n if side is right.\n The number of Householder reflectors that form Q. - @param[in] + @param[in] A pointer to type. Array on the GPU of size lda*k.\n The i-th column has the Householder vector v(i) associated with H(i) as returned by GEQRF in the first k columns of its argument A. @@ -742,19 +742,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, @param[inout] C pointer to type. Array on the GPU of size ldc*n.\n On input, the matrix C. On output it is overwritten with - Q*C, C*Q, Q'*C, or C*Q'. + Q*C, C*Q, Q'*C, or C*Q'. @param[in] lda rocsolver_int. ldc >= m.\n - Leading dimension of C. - + Leading dimension of C. 
+ ****************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sorm2r(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv, @@ -765,8 +765,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv, @@ -777,8 +777,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, \details (This is the blocked version of the algorithm). - - The matrix Q is applied in one of the following forms, depending on + + The matrix Q is applied in one of the following forms, depending on the values of side and trans: Q * C (No transpose from the left) @@ -790,7 +790,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, Q = H(1) * H(2) * ... * H(k) - or order m if applying from the left, or n if applying from the right. Q is never stored, it is + or order m if applying from the left, or n if applying from the right. Q is never stored, it is calculated from the Householder vectors and scalars returned by the QR factorization GEQRF. @param[in] @@ -807,10 +807,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix C. - @param[in] + @param[in] k rocsovler_int. k >= 0; k <= m if side is left, k <= n if side is right.\n The number of Householder reflectors that form Q. - @param[in] + @param[in] A pointer to type. Array on the GPU of size lda*k.\n The i-th column has the Householder vector v(i) associated with H(i) as returned by GEQRF in the first k columns of its argument A. @@ -823,19 +823,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, @param[inout] C pointer to type. Array on the GPU of size ldc*n.\n On input, the matrix C. On output it is overwritten with - Q*C, C*Q, Q'*C, or C*Q'. + Q*C, C*Q, Q'*C, or C*Q'. @param[in] lda rocsolver_int. ldc >= m.\n - Leading dimension of C. - + Leading dimension of C. + ****************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sormqr(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv, @@ -846,8 +846,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv, @@ -880,10 +880,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix A. + The number of rows of the matrix A. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix A. + The number of colums of the matrix A. @param[inout] A pointer to type. 
Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix A to be factored. @@ -891,7 +891,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, The unit diagonal elements of L are not stored. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to rocsolver_int. Array on the GPU of dimension min(m,n).\n The vector of pivot indices. Elements of ipiv are 1-based indices. @@ -900,14 +900,14 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, Matrix P of the factorization can be derived from ipiv. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful exit. + If info = 0, succesful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot. - + ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -915,7 +915,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -923,7 +923,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -931,7 +931,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -968,8 +968,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, lda rocsolver_int. lda >= m.\n Specifies the leading dimension of matrices A_i. @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivot indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -981,17 +981,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1001,7 +1001,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1011,7 +1011,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1021,7 +1021,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1034,7 +1034,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand \details (This is the right-looking Level 2 BLAS version of the algorithm). - + The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i @@ -1064,8 +1064,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivots indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivots indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1077,17 +1077,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1098,7 +1098,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1109,7 +1109,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1120,7 +1120,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1147,10 +1147,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix A. + The number of rows of the matrix A. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix A. + The number of colums of the matrix A. @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix A to be factored. @@ -1158,7 +1158,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han The unit diagonal elements of L are not stored. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to rocsolver_int. Array on the GPU of dimension min(m,n).\n The vector of pivot indices. Elements of ipiv are 1-based indices. @@ -1167,14 +1167,14 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han Matrix P of the factorization can be derived from ipiv. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful exit. + If info = 0, succesful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot. 
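One possible call sequence for the GETRF routine described above, with the pivot vector and info flag copied back to the host to check for a singular U. The info argument is placed last, following the documented parameter order; the handle helpers and header name are again assumptions rather than part of this header.

    #include <stdio.h>
    #include <hip/hip_runtime.h>
    #include <rocsolver.h>                        /* assumed header name */

    int main(void) {
        rocsolver_handle handle;
        rocsolver_create_handle(&handle);         /* assumed auxiliary API */

        /* 3x3 column-major matrix, lda = m. */
        const rocsolver_int m = 3, n = 3, lda = 3;
        double hA[9] = { 2, -3, -1,   1, -1, 2,   3, 2, 1 };
        rocsolver_int hIpiv[3], hInfo;

        double *dA; rocsolver_int *dIpiv, *dInfo;
        hipMalloc((void**)&dA, sizeof(double) * lda * n);
        hipMalloc((void**)&dIpiv, sizeof(rocsolver_int) * 3);
        hipMalloc((void**)&dInfo, sizeof(rocsolver_int));
        hipMemcpy(dA, hA, sizeof(double) * lda * n, hipMemcpyHostToDevice);

        /* A <- P * L * U with partial pivoting; ipiv holds 1-based pivot rows. */
        rocsolver_dgetrf(handle, m, n, dA, lda, dIpiv, dInfo);

        hipMemcpy(hA, dA, sizeof(double) * lda * n, hipMemcpyDeviceToHost);
        hipMemcpy(hIpiv, dIpiv, sizeof(rocsolver_int) * 3, hipMemcpyDeviceToHost);
        hipMemcpy(&hInfo, dInfo, sizeof(rocsolver_int), hipMemcpyDeviceToHost);
        if (hInfo > 0) printf("U(%d,%d) is zero; U is singular\n", hInfo, hInfo);

        hipFree(dA); hipFree(dIpiv); hipFree(dInfo);
        rocsolver_destroy_handle(handle);         /* assumed auxiliary API */
        return 0;
    }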
- + ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1182,7 +1182,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1190,7 +1190,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1198,7 +1198,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1235,8 +1235,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, lda rocsolver_int. lda >= m.\n Specifies the leading dimension of matrices A_i. @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivot indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1248,17 +1248,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1268,7 +1268,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1278,7 +1278,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1288,7 +1288,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1301,7 +1301,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand \details (This is the right-looking Level 3 BLAS version of the algorithm). - + The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i @@ -1331,8 +1331,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivots indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivots indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1344,17 +1344,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1365,7 +1365,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1376,7 +1376,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1387,7 +1387,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1406,7 +1406,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han A = Q * [ R ] [ 0 ] - where R is upper triangular (upper trapezoidal if m < n), and Q is + where R is upper triangular (upper trapezoidal if m < n), and Q is a m-by-m orthogonal matrix represented as the product of Householder matrices Q = H(1) * H(2) * ... * H(k), with k = min(m,n) @@ -1414,8 +1414,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i) * v(i)' - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1428,30 +1428,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the m - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). 
********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GEQR2_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1464,7 +1464,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, A_j = Q_j * [ R_j ] [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1473,7 +1473,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1486,19 +1486,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1507,22 +1507,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GEQR2_STRIDED_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1533,9 +1533,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1544,7 +1544,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1557,23 +1557,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1582,24 +1582,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQ2 computes a LQ factorization of a general m-by-n matrix A. @@ -1610,8 +1610,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han The factorization has the form A = [ L 0 ] * Q - - where L is lower triangular (lower trapezoidal if m > n), and Q is + + where L is lower triangular (lower trapezoidal if m > n), and Q is a n-by-n orthogonal matrix represented as the product of Householder matrices Q = H(k) * H(k-1) * ... * H(1), with k = min(m,n) @@ -1619,8 +1619,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i)' * v(i) - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1633,30 +1633,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and delow the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L; the elements above the diagonal are the n - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i).
********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GELQ2_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -1666,9 +1666,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -1677,7 +1677,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1690,19 +1690,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1711,22 +1711,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQ2_STRIDED_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -1736,9 +1736,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -1747,7 +1747,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1760,23 +1760,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1785,24 +1785,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); @@ -1815,8 +1815,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han A = Q * [ R ] [ 0 ] - - where R is upper triangular (upper trapezoidal if m < n), and Q is + + where R is upper triangular (upper trapezoidal if m < n), and Q is a m-by-m orthogonal matrix represented as the product of Householder matrices Q = H(1) * H(2) * ... * H(k), with k = min(m,n) @@ -1824,8 +1824,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i) * v(i)' - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1838,30 +1838,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the m - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! 
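For orientation, here is a minimal, illustrative sketch of calling the unbatched QR factorization declared just above (the LQ routines further down follow the same calling pattern); it assumes the rocsolver.h header name and omits error checking and data transfers.

/* Illustrative sketch only -- assumes the dgeqrf signature declared just above:
   (handle, m, n, A, lda, ipiv), with A and ipiv resident on the GPU. */
#include <hip/hip_runtime.h>
#include <rocsolver.h>   /* header name assumed */

void qr_factor(rocsolver_handle handle, rocsolver_int m, rocsolver_int n)
{
    const rocsolver_int lda  = m;
    const rocsolver_int kmin = (m < n) ? m : n;

    double *dA, *dTau;
    hipMalloc(&dA,   sizeof(double) * lda * n);
    hipMalloc(&dTau, sizeof(double) * kmin);   /* Householder scalars, length min(m,n) */

    /* ... copy A into dA ... */

    rocsolver_dgeqrf(handle, m, n, dA, lda, dTau);

    /* On exit dA holds R on and above the diagonal and the Householder
       vectors v(i) below it, as described in the comment above. */
    hipFree(dA); hipFree(dTau);
}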
\brief GEQRF_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1872,9 +1872,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1883,7 +1883,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1896,19 +1896,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -1917,22 +1917,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GEQRF_STRIDED_BATCHED computes the QR factorization of a batch of general m-by-n matrices. 
@@ -1943,9 +1943,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1954,7 +1954,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1967,23 +1967,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -1992,24 +1992,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! 
\brief GELQF computes a LQ factorization of a general m-by-n matrix A. @@ -2020,8 +2020,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han The factorization has the form A = [ L 0 ] * Q - - where L is lower triangular (lower trapezoidal if m > n), and Q is + + where L is lower triangular (lower trapezoidal if m > n), and Q is a n-by-n orthogonal matrix represented as the product of Householder matrices Q = H(k) * H(k-1) * ... * H(1), with k = min(m,n) @@ -2029,8 +2029,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i)' * v(i) - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2043,30 +2043,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and delow the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L; the elements above the diagonal are the n - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GELQF_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -2076,9 +2076,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -2087,7 +2087,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2100,19 +2100,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored.
- On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -2121,22 +2121,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQF_STRIDED_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -2146,9 +2146,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -2157,7 +2157,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2170,23 +2170,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. 
+ Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -2195,46 +2195,46 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GETRS solves a system of n linear equations on n variables using the LU factorization computed by GETRF. \details - It solves one of the following systems: + It solves one of the following systems: - A * X = B (no transpose), - A' * X = B (transpose), or + A * X = B (no transpose), + A' * X = B (transpose), or A* * X = B (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations. + Specifies the form of the system of equations. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of A. + The order of the system, i.e. the number of columns and rows of A. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2244,7 +2244,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_han The factors L and U of the factorization A = P*L*U returned by GETRF. @param[in] lda rocsolver_int. lda >= n.\n - The leading dimension of A. + The leading dimension of A. @param[in] ipiv pointer to rocsolver_int. Array on the GPU of dimension n.\n The pivot indices returned by GETRF. 
@@ -2278,26 +2278,26 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int *ipiv, rocblas_double_complex *B, const rocsolver_int ldb); -/*! \brief GETRS_BATCHED solves a batch of systems of n linear equations on n variables +/*! \brief GETRS_BATCHED solves a batch of systems of n linear equations on n variables using the LU factorization computed by GETRF_BATCHED. \details - For each instance j in the batch, it solves one of the following systems: + For each instance j in the batch, it solves one of the following systems: - A_j * X_j = B_j (no transpose), - A_j' * X_j = B_j (transpose), or + A_j * X_j = B_j (no transpose), + A_j' * X_j = B_j (transpose), or A_j* * X_j = B_j (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations of each instance in the batch. + Specifies the form of the system of equations of each instance in the batch. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of all A_j matrices. + The order of the system, i.e. the number of columns and rows of all A_j matrices. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2312,7 +2312,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n Contains the vectors ipiv_j of pivot indices returned by GETRF_BATCHED. @param[in,out] - B Array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*nrhs.\n + B Array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*nrhs.\n On entry, the right hand side matrices B_j. On exit, the solution matrix X_j of each system in the batch. @param[in] @@ -2320,7 +2320,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( The leading dimension of matrices B_j. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of instances (systems) in the batch. + Number of instances (systems) in the batch. ********************************************************************/ @@ -2337,35 +2337,35 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs_batched( ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count); ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count); -/*! \brief GETRS_STRIDED_BATCHED solves a batch of systems of n linear equations on n variables +/*! 
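To tie GETRF and GETRS together, here is a minimal, illustrative sketch of solving A*X = B on the GPU with the unbatched routines; the batched and strided-batched variants documented above follow the same pattern with the extra stride/batch arguments, and the helper name and wrapping are hypothetical.

/* Illustrative sketch only -- assumes the unbatched getrs signature shown above:
   (handle, trans, n, nrhs, A, lda, ipiv, B, ldb), all arrays on the GPU. */
void lu_solve(rocsolver_handle handle, rocsolver_int n, rocsolver_int nrhs,
              double *dA, rocsolver_int lda,   /* LU factors returned by GETRF      */
              rocsolver_int *dIpiv,            /* pivot indices returned by GETRF   */
              double *dB, rocsolver_int ldb)   /* RHS on entry, solution X on exit  */
{
    /* Solve A * X = B; pass rocblas_operation_transpose for A' * X = B instead. */
    rocsolver_dgetrs(handle, rocblas_operation_none, n, nrhs,
                     dA, lda, dIpiv, dB, ldb);
}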
\brief GETRS_STRIDED_BATCHED solves a batch of systems of n linear equations on n variables using the LU factorization computed by GETRF_STRIDED_BATCHED. \details - For each instance j in the batch, it solves one of the following systems: + For each instance j in the batch, it solves one of the following systems: - A_j * X_j = B_j (no transpose), - A_j' * X_j = B_j (transpose), or + A_j * X_j = B_j (no transpose), + A_j' * X_j = B_j (transpose), or A_j* * X_j = B_j (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations of each instance in the batch. + Specifies the form of the system of equations of each instance in the batch. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of all A_j matrices. + The order of the system, i.e. the number of columns and rows of all A_j matrices. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2378,7 +2378,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( The leading dimension of matrices A_j. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[in] ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n @@ -2392,11 +2392,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( The leading dimension of matrices B_j. @param[in] strideB rocsolver_int.\n - Stride from the start of one matrix B_j and the next one B_(j+1). + Stride from the start of one matrix B_j and the next one B_(j+1). There is no restriction for the value of strideB. Normal use case is strideB >= ldb*nrhs. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of instances (systems) in the batch. + Number of instances (systems) in the batch. ********************************************************************/ @@ -2413,13 +2413,13 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs_strided_batched( ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count); ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count); @@ -2427,7 +2427,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( positive definite matrix A. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). 
The factorization has the form: @@ -2453,8 +2453,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( specifies the leading dimension of A. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful factorization of matrix A. - If info = i > 0, the leading minor of order i of A is not positive definite. + If info = 0, successful factorization of matrix A. + If info = i > 0, the leading minor of order i of A is not positive definite. The factorization stopped at this point. ********************************************************************/ @@ -2472,11 +2472,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2(rocsolver_handle handle, rocblas_int* info); -/*! \brief POTF2_BATCHED computes the Cholesky factorization of a +/*! \brief POTF2_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2496,24 +2496,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2(rocsolver_handle handle, The dimension of matrix A_i. @param[inout] A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, successful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocblas_int* info, @@ -2521,17 +2521,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocblas_int* info, const rocsolver_int batch_count); -/*! \brief POTF2_STRIDED_BATCHED computes the Cholesky factorization of a +/*! \brief POTF2_STRIDED_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2551,28 +2551,28 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_batched(rocsolver_handle hand The dimension of matrix A_i. @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored.
On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_i and the next one A_(i+1). + Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, successful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2581,7 +2581,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2592,7 +2592,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_han positive definite matrix A. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization has the form: @@ -2618,8 +2618,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_han specifies the leading dimension of A. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful factorization of matrix A. - If info = i > 0, the leading minor of order i of A is not positive definite. + If info = 0, successful factorization of matrix A. + If info = i > 0, the leading minor of order i of A is not positive definite. The factorization stopped at this point. ********************************************************************/ @@ -2637,11 +2637,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf(rocsolver_handle handle, rocblas_int* info); -/*! \brief POTRF_BATCHED computes the Cholesky factorization of a +/*! \brief POTRF_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2661,24 +2661,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf(rocsolver_handle handle, The dimension of matrix A_i. @param[inout] A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i.
- If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, successful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocblas_int* info, @@ -2686,17 +2686,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocblas_int* info, const rocsolver_int batch_count); -/*! \brief POTRF_STRIDED_BATCHED computes the Cholesky factorization of a +/*! \brief POTRF_STRIDED_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2716,28 +2716,28 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_batched(rocsolver_handle hand The dimension of matrix A_i. @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_i and the next one A_(i+1). + Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, successful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch.
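For completeness, a minimal, illustrative sketch of the unbatched Cholesky call documented earlier in this family; uplo selects which triangle of A is referenced, a nonzero info reports the first non-positive-definite leading minor, and the header name is assumed.

/* Illustrative sketch only -- assumes the unbatched potrf signature shown above:
   (handle, uplo, n, A, lda, info), with A and info resident on the GPU. */
#include <hip/hip_runtime.h>
#include <rocsolver.h>   /* header name assumed */

void cholesky_factor(rocsolver_handle handle, rocsolver_int n)
{
    const rocsolver_int lda = n;
    double      *dA;
    rocblas_int *dInfo;
    hipMalloc(&dA,    sizeof(double) * lda * n);
    hipMalloc(&dInfo, sizeof(rocblas_int));

    /* ... copy the symmetric positive definite matrix A into dA ... */

    rocsolver_dpotrf(handle, rocblas_fill_lower, n, dA, lda, dInfo);

    /* *dInfo == 0 on success; *dInfo == i > 0 means the leading minor of
       order i is not positive definite, as described above. */
    hipFree(dA); hipFree(dInfo);
}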
********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2746,7 +2746,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-types.h b/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-types.h index 55d3e42a..e8cf8251 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-types.h +++ b/ROCm_Libraries/rocSOLVER/docs/library/include/rocsolver-types.h @@ -11,8 +11,8 @@ #include -/*! \brief Used to specify int32 or int64. - \details rocsolver_int is a rocblas_int +/*! \brief Used to specify int32 or int64. + \details rocsolver_int is a rocblas_int ******************************************************************/ typedef rocblas_int rocsolver_int; @@ -20,12 +20,12 @@ typedef rocblas_float_complex rocsolver_float_complex; typedef rocblas_double_complex rocsolver_double_complex; typedef rocblas_half rocsolver_half; -/*! \brief A structure holding the rocsolver library context. - \details +/*! \brief A structure holding the rocsolver library context. + \details It must be initialized using rocsolver_create_handle() - and the returned handle must be passed to all subsequent library + and the returned handle must be passed to all subsequent library function calls. It should be destroyed at the end using rocsolver_destroy_handle().\n - rocsolver_handle is a rocblas_handle. + rocsolver_handle is a rocblas_handle. *************************************************************************/ typedef rocblas_handle rocsolver_handle; @@ -56,16 +56,16 @@ typedef rocblas_status rocsolver_status; typedef rocblas_layer_mode rocsolver_layer_mode; -/*! \brief Used to specify the order in which multiple elementary matrices are applied together - ********************************************************************************/ +/*! \brief Used to specify the order in which multiple elementary matrices are applied together + ********************************************************************************/ typedef enum rocsolver_direct_ { rocsolver_forward_direction = 171, /**< Elementary matrices applied from the right. */ rocsolver_backward_direction = 172, /**< Elementary matrices applied from the left. */ } rocsolver_direct; -/*! \brief Used to specify how householder vectors are stored in a matrix of vectors - ********************************************************************************/ +/*! \brief Used to specify how householder vectors are stored in a matrix of vectors + ********************************************************************************/ typedef enum rocsolver_storev_ { rocsolver_column_wise = 181, /**< Householder vectors are stored in the columns of a matrix. 
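Since every routine in this header takes the library context as its first argument, a short, illustrative sketch of the handle lifecycle described in the comment above; the pointer out-parameter convention of rocsolver_create_handle is an assumption here, and error handling is reduced to a single check.

/* Illustrative sketch only -- rocsolver_create_handle/rocsolver_destroy_handle
   are the entry points named in the comment above; the &handle out-parameter
   convention is assumed. */
#include <rocsolver.h>   /* header name assumed */

int run(void)
{
    rocsolver_handle handle;
    rocsolver_status status = rocsolver_create_handle(&handle);
    if (status != rocblas_status_success)
        return -1;                     /* rocsolver_status is a rocblas_status */

    /* ... pass 'handle' to the factorization and solver calls above ... */

    rocsolver_destroy_handle(handle);
    return 0;
}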
*/ diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/CMakeLists.txt b/ROCm_Libraries/rocSOLVER/docs/library/src/CMakeLists.txt index cbf3d10d..4a435950 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/CMakeLists.txt +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/CMakeLists.txt @@ -82,7 +82,7 @@ add_library( rocsolver ${rocsolver_lapack_source} ${relative_rocsolver_headers_public} ${rocsolver_auxiliary_source} - ${rocsolver_common_source} + ${rocsolver_common_source} ) add_library( roc::rocsolver ALIAS rocsolver ) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.cpp index 9c52fd62..8c4e0c70 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_larf.hpp" template -rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, T* x, const rocsolver_int incx, const T* alpha, T* A, const rocsolver_int lda) { @@ -24,7 +24,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side rocblas_int stridep = 0; rocblas_int batch_count=1; - return rocsolver_larf_template(handle,side, + return rocsolver_larf_template(handle,side, m,n, x,0, //vector shifted 0 entries incx, @@ -33,7 +33,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side stridep, A,0, //matrix shifted 0 entries lda, - stridea, + stridea, batch_count); } @@ -46,14 +46,14 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side extern "C" { -ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, float* x, const rocsolver_int incx, const float* alpha, float* A, const rocsolver_int lda) { return rocsolver_larf_impl(handle, side, m, n, x, incx, alpha, A, lda); } -ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, double* x, const rocsolver_int incx, const double* alpha, double* A, const rocsolver_int lda) { diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.hpp index 27a5a0d4..3755ea14 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larf.hpp @@ -19,8 +19,8 @@ template rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, - const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, + const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, + const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, const rocsolver_int lda, const 
rocblas_int stridea, const rocblas_int batch_count) { // quick return @@ -40,7 +40,7 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ T* zeroInt; //constant 0 in device hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -66,16 +66,16 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ // OF A AND X, AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + //memory in GPU (workspace) T *workvec; hipMalloc(&workvec, sizeof(T)*order*batch_count); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute the matrix vector product (W=tau*A'*X or W=tau*A*X) for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.cpp index 12ed4e92..d28b4a03 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.cpp @@ -5,10 +5,10 @@ #include "rocauxiliary_larfb.hpp" template -rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* F, const rocsolver_int ldf, T* A, const rocsolver_int lda) { @@ -22,7 +22,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid if (storev == rocsolver_row_wise) { if (ldv < k) return rocblas_status_invalid_size; - } else { + } else { if ((side == rocblas_side_left && ldv < m) || (side == rocblas_side_right && ldv < n)) return rocblas_status_invalid_size; } @@ -34,7 +34,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid rocblas_int stridef = 0; rocblas_int batch_count=1; - return rocsolver_larfb_template(handle,side,trans,direct,storev, + return rocsolver_larfb_template(handle,side,trans,direct,storev, m,n,k, V,0, //shifted 0 entries ldv, @@ -44,7 +44,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid stridef, A,0, //shifted 0 entries lda, - stridea, + stridea, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.hpp index 5214e29a..dc4ee469 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfb.hpp @@ -19,7 +19,7 @@ template -__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int 
strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -38,7 +38,7 @@ __global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U } template -__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -52,18 +52,18 @@ __global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A Wp = work + b*strideW; Ap = load_ptr_batch(A,shiftA,b,strideA); - Ap[i + j*lda] -= Wp[i + j*ldw]; + Ap[i + j*lda] -= Wp[i + j*ldw]; } } template -rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, T *F, const rocsolver_int shiftF, - const rocsolver_int ldf, const rocsolver_int strideF, + const rocsolver_int ldf, const rocsolver_int strideF, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, const rocsolver_int batch_count) { @@ -100,14 +100,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver //determine the side, size of workspace //and whether V is trapezoidal - rocsolver_operation transp; + rocsolver_operation transp; rocsolver_fill uploV; bool trap; rocblas_int order, ldw; - bool colwise = (storev == rocsolver_column_wise); + bool colwise = (storev == rocsolver_column_wise); bool leftside = (side == rocblas_side_left); size_t offsetV; - + if (leftside) { order = n; ldw = k; @@ -120,16 +120,16 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver if (colwise) { uploV = rocblas_fill_lower; offsetV = idx2D(k,0,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_transpose; - else + else transp = rocblas_operation_none; } else { uploV = rocblas_fill_upper; offsetV = idx2D(0,k,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_none; - else + else transp = rocblas_operation_transpose; } @@ -146,15 +146,15 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver rocblas_int blocksx = (order - 1)/32 + 1; rocblas_int blocksy = (ldw - 1)/32 + 1; hipLaunchKernelGGL(copymatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + // BACKWARD DIRECTION TO BE IMPLEMENTED... 
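For reference, rocsolver_larfb_template applies a block Householder reflector H = I - V*T*V' to the matrix A; copymatA1 stages the A1 block into the workspace and addmatA1 folds the final correction back into A. The sketch below is a plain host-side illustration of the forward, column-wise, left-side case only, using naive loops on real double data and a hypothetical helper name; the actual routine dispatches on side/trans/direct/storev, uses rocBLAS gemm/trmm, and runs batched on the GPU.

    #include <cstddef>
    #include <vector>

    // Illustrative only: apply H = I - V * T * V^T from the left to an m x n
    // matrix A (all column-major, leading dimension = number of rows).
    // V is m x k (Householder vectors as columns), T is k x k upper triangular.
    static void apply_block_reflector_left(int m, int n, int k,
                                           const std::vector<double>& V,
                                           const std::vector<double>& T,
                                           std::vector<double>& A)
    {
        // W = V^T * A  (k x n): the "V1'*A1 + V2'*A2" accumulation the template builds in stages
        std::vector<double> W(static_cast<std::size_t>(k) * n, 0.0);
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < k; ++i)
                for (int p = 0; p < m; ++p)
                    W[i + j * k] += V[p + i * m] * A[p + j * m];

        // W <- T * W: the triangular factor applied with trmm in the template
        std::vector<double> TW(static_cast<std::size_t>(k) * n, 0.0);
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < k; ++i)
                for (int p = 0; p < k; ++p)
                    TW[i + j * k] += T[i + p * k] * W[p + j * k];

        // A <- A - V * (T * V^T * A): the gemm updates plus the addmatA1 correction
        for (int j = 0; j < n; ++j)
            for (int i = 0; i < m; ++i)
                for (int p = 0; p < k; ++p)
                    A[i + j * m] -= V[i + p * m] * TW[p + j * k];
    }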
rocsolver_fill uploT = rocblas_fill_upper; if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - + //compute: // V1' * A1, or - // or + // or // A1 * V1 for (int b=0;b(VV,shiftV,b,strideV); @@ -162,14 +162,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } // compute: - // V1' * A1 + V2' * A2 - // or + // V1' * A1 + V2' * A2 + // or // A1 * V1 + A2 * V2 - if (trap) { + if (trap) { for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,ldw,order,m-k,oneInt, (Vp + offsetV),ldv, (Ap + idx2D(k,0,lda)),lda, @@ -183,10 +183,10 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } - // compute: + // compute: // trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) + // (A1 * V1 + A2 * V2) * trans(T) for (int b=0;b(FF,shiftF,b,strideF); rocblas_trmm(handle,side,uploT,trans,rocblas_diagonal_non_unit,ldw,order,oneInt,Fp,ldf,(work + b*strideW),ldw); @@ -195,7 +195,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver // compute: // A2 - V2 * trans(T) * (V1' * A1 + V2' * A2) // or - // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' + // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' if (transp == rocblas_operation_transpose) transp = rocblas_operation_none; else @@ -205,7 +205,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,m-k,order,ldw,minoneInt, (Vp + offsetV),ldv, (work + b*strideW),ldw, @@ -218,22 +218,22 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } } - + // compute: // V1 * trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) * V1' + // (A1 * V1 + A2 * V2) * trans(T) * V1' for (int b=0;b(VV,shiftV,b,strideV); rocblas_trmm(handle,side,uploV,transp,rocblas_diagonal_unit,ldw,order,oneInt,Vp,ldv,(work + b*strideW),ldw); } - + // compute: // A1 - V1 * trans(T) * (V1' * A1 + V2' * A2) // or // A1 - (A1 * V1 + A2 * V2) * trans(T) * V1' hipLaunchKernelGGL(addmatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + hipFree(minoneInt); hipFree(oneInt); hipFree(work); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.cpp index 4b1e00fa..8e651066 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.cpp @@ -26,7 +26,7 @@ rocblas_status rocsolver_larfg_impl(rocblas_handle handle, const rocblas_int n, incx, stridex, tau, - strideP, + strideP, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.hpp index f4fc193c..38683f5d 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larfg.hpp @@ -42,7 +42,7 @@ __global__ void set_taubeta(T *tau, const rocblas_int strideP, T *norms, U alpha template -rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int 
shifta, +rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int shifta, U x, const rocblas_int shiftx, const rocblas_int incx, const rocblas_int stridex, T *tau, const rocblas_int strideP, const rocblas_int batch_count) { @@ -54,11 +54,11 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int hipStream_t stream; rocblas_get_stream(handle, &stream); dim3 gridReset(1, batch_count, 1); - dim3 threads(1, 1, 1); + dim3 threads(1, 1, 1); if (n == 1) { hipLaunchKernelGGL(reset_batch_info,gridReset,threads,0,stream,tau,strideP,1,0); - return rocblas_status_success; - } + return rocblas_status_success; + } T *xp; @@ -73,12 +73,12 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *norms; - hipMalloc(&norms, sizeof(T)*batch_count); + hipMalloc(&norms, sizeof(T)*batch_count); // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute norm of x for (int b=0;b(xx,shiftx,b,stridex); @@ -87,9 +87,9 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //set value of tau and beta and scalling factor for vector x //alpha <- beta - //norms <- scalling + //norms <- scalling hipLaunchKernelGGL(set_taubeta,dim3(batch_count),dim3(1),0,stream,tau,strideP,norms,alpha,shifta,stridex); - + //compute vector v=x*norms for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.cpp index 5ab79a92..10915015 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_larft.hpp" template -rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, const rocsolver_int n, +rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* tau, T* F, const rocsolver_int ldf) { @@ -38,7 +38,7 @@ rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_dir stridet, F, ldf, - stridef, + stridef, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.hpp index ee2add09..8a38ac3f 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_larft.hpp @@ -17,8 +17,8 @@ #include "common_device.hpp" template -__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, - T* tau, const rocsolver_int strideT, +__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, + T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_storev storev) { const auto blocksize = hipBlockDim_x; @@ -51,20 +51,20 @@ __global__ void set_tau(const rocsolver_int k, T* tau, const rocsolver_int strid const auto blocksize = 
hipBlockDim_x; const auto b = hipBlockIdx_x; const auto i = hipBlockIdx_y * blocksize + hipThreadIdx_x; - + if (i < k) { T *tp; tp = tau + b*strideT; tp[i] = -tp[i]; } } - + template -rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, +rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, - const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_int batch_count) { // quick return @@ -84,7 +84,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipMemcpy(oneInt, &one, sizeof(T), hipMemcpyHostToDevice); hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -98,26 +98,26 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - //Fix diagonal of T, make zero the non used triangular part, + //Fix diagonal of T, make zero the non used triangular part, //setup tau (changing signs) and account for the non-stored 1's on the householder vectors rocblas_int blocks = (k - 1)/32 + 1; hipLaunchKernelGGL(set_triangular,dim3(blocks,blocks,batch_count),dim3(32,32),0,stream, k,V,shiftV,ldv,strideV,tau,strideT,F,ldf,strideF,storev); hipLaunchKernelGGL(set_tau,dim3(batch_count,blocks),dim3(32,1),0,stream,k,tau,strideT); - // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS + // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS // AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - - rocblas_operation trans; - - for (int i = 1; i < k; ++i) { + rocblas_operation trans; + + + for (int i = 1; i < k; ++i) { //compute the matrix vector product, using the householder vectors for (int b=0;b(VV,shiftV,b,strideV); Fp = F + b*strideF; - rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, + rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, (Fp + idx2D(0,i,ldf)), 1, zeroInt, (Fp + idx2D(0,i,ldf)), 1); - } + } } //restore tau @@ -151,7 +151,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipFree(oneInt); hipFree(zeroInt); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.cpp index e79f652f..360fef79 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.cpp @@ -54,14 +54,14 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dlaswp(rocsolver_handle handle, const } ROCSOLVER_EXPORT rocblas_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); } ROCSOLVER_EXPORT rocblas_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.hpp index 0dc74205..4615a7ec 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_laswp.hpp @@ -51,10 +51,10 @@ __global__ void laswp_kernel(const rocblas_int n, U AA, const rocblas_int shiftA template rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int k1, const rocblas_int k2, - const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, + const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, const rocblas_int batch_count) { // quick return - if (n == 0 || !batch_count) + if (n == 0 || !batch_count) return rocblas_status_success; rocblas_int start, end, inc; @@ -63,7 +63,7 @@ rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int end = k1 - 1; inc = -1; incx = -incx; - } + } else { start = k1; end = k2 + 1; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.cpp index 102fd83e..465b3635 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.cpp +++ 
b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_org2r.hpp" template -rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.hpp index 08d072aa..2dbcc11e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_org2r.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j > i) + else if (j > i) Ap[i + j*lda] = 0.0; else if (j >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j,j+1,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th column -corresponding to H(i)- if (j < m - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), - (M + idx2D(j + 1, j, lda)), 1); - } + rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), + (M + idx2D(j + 1, j, lda)), 1); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.cpp index bd3e4714..eb4f0bb6 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orgbr.hpp" template -rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.hpp index a1315b6e..deec30a8 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgbr.hpp @@ -23,7 +23,7 @@ #define BS 32 //blocksize for kernels template -__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -33,17 +33,17 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && j <= i) { rocblas_int offset = j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy columns - Wp[i + j*ldw - offset] = (j == 0 ? 0.0 : Ap[i+1 + (j-1)*lda]); - + Wp[i + j*ldw - offset] = (j == 0 ? 
0.0 : Ap[i+1 + (j-1)*lda]); + } else { - // shift columns to the right + // shift columns to the right Ap[i+1 + j*lda] = Wp[i + j*ldw - offset]; - + // make first row the identity if (i == j) { Ap[(j+1)*lda] = 0.0; @@ -55,7 +55,7 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const } template -__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -65,17 +65,17 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && i <= j) { rocblas_int offset = j*ldw - j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy rows - Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); - + Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); + } else { - // shift rows downward + // shift rows downward Ap[i + (j+1)*lda] = Wp[i + j*ldw - offset]; - + // make first column the identity if (i == j) { Ap[i+1] = 0.0; @@ -87,9 +87,9 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const } template -rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -99,11 +99,11 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization + // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization // of a m-by-k matrix A (given by gebrd) if (storev == rocsolver_column_wise) { if (m >= k) { - rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); + rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); } else { // shift the householder vectors provided by gebrd as they come below the first subdiagonal // workspace @@ -115,21 +115,21 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (m - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); - + 
hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + // result - rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); + hipFree(W); - } + } } - - // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization + + // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization // of a k-by-n matrix A (given by gebrd) else { if (n > k) { @@ -145,19 +145,19 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (n - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // result rocsolver_orglq_template(handle, n-1, n-1, n-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + hipFree(W); } - } + } return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.cpp index 27e3d8ed..ec38dc16 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgl2.hpp" template -rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.hpp index 202a4fc3..35475070 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgl2.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j < i) + else if (j < i) Ap[i + j*lda] = 0.0; else if (i >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const 
rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j+1,j,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th row -corresponding to H(i)- if (j < n - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), - (M + idx2D(j, j + 1, lda)), lda); - } + rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), + (M + idx2D(j, j + 1, lda)), lda); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.cpp index 35b17482..e3039734 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orglq.hpp" template -rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.hpp index 97886fce..39f77a46 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orglq.hpp @@ -32,16 +32,16 @@ __global__ void set_zero_row(const rocblas_int m, const rocblas_int kk, U A, if (i < m && j < kk) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int 
strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -50,9 +50,9 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_orgl2_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -64,34 +64,34 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding left submatrix if (kk < m) { blocksx = (m - kk - 1)/32 + 1; blocksy = (kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, m,kk,A,shiftA,lda,strideA); - - rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < m) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -110,13 +110,13 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_orgl2_template(handle, jb, n - j, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_orgl2_template(handle, jb, n - j, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.cpp index ef11bd5e..7b1aceec 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgqr.hpp" template -rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.hpp index 86386317..8079413c 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orgqr.hpp @@ 
-32,15 +32,15 @@ __global__ void set_zero_col(const rocblas_int n, const rocblas_int kk, U A, if (i < kk && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -49,9 +49,9 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_org2r_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -63,34 +63,34 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding top submatrix if (kk < n) { blocksx = (kk - 1)/32 + 1; blocksy = (n- kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, n,kk,A,shiftA,lda,strideA); - - rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < n) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -109,13 +109,13 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_org2r_template(handle, m - j, jb, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_org2r_template(handle, m - j, jb, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.cpp index 34ee185b..fdaa1724 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orm2r.hpp" template 
-rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.hpp index 10522f08..dd83c375 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_orm2r.hpp @@ -18,10 +18,10 @@ #include "../auxiliary/rocauxiliary_larf.hpp" template -rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -72,14 +72,14 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver ncol = n - i; jc = i; } - - // insert one in A(i,i) tobuild/apply the householder matrix + + // insert one in A(i,i) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); - // Apply current Householder reflector + // Apply current Householder reflector rocsolver_larf_template(handle,side, //side nrow, //number of rows of matrix to modify - ncol, //number of columns of matrix to modify + ncol, //number of columns of matrix to modify A, shiftA + idx2D(i,i,lda), //householder vector x 1, strideA, //inc of x (ipiv + i), strideP, //householder scalar (alpha) @@ -90,7 +90,7 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver // restore original value of A(i,i) hipLaunchKernelGGL(restore_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.cpp index 7d11d5e6..820f4a46 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_ormqr.hpp" template -rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + 
const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.hpp index fd0b523c..b24d77cd 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/auxiliary/rocauxiliary_ormqr.hpp @@ -20,10 +20,10 @@ #include "../auxiliary/rocauxiliary_larft.hpp" template -rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -35,14 +35,14 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked variant of the algorithm - if (k <= ORMQR_ORM2R_BLOCKSIZE) + if (k <= ORMQR_ORM2R_BLOCKSIZE) return rocsolver_orm2r_template(handle, side, trans, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, C, shiftC, ldc, strideC, batch_count); //memory in GPU (workspace) T* work; rocblas_int ldw = ORMQR_ORM2R_BLOCKSIZE; rocblas_int strideW = ldw *ldw; - hipMalloc(&work, sizeof(T)*strideW*batch_count); + hipMalloc(&work, sizeof(T)*strideW*batch_count); // determine limits and indices bool left = (side == rocblas_side_left); @@ -100,7 +100,7 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver C, shiftC + idx2D(ic,jc,ldc),ldc,strideC, batch_count); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/common/rocblas.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/common/rocblas.cpp index 2d57c7d9..65dd0697 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/common/rocblas.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/common/rocblas.cpp @@ -104,7 +104,7 @@ rocblas_status rocblas_iamax(rocblas_handle handle, rocblas_int n, return rocblas_izamax(handle, n, x, incx, result); } -//ger +//ger template <> rocblas_status rocblas_ger(rocblas_handle handle, rocblas_int m, rocblas_int n, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/include/common_device.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/include/common_device.hpp index 1aaaab61..d28acb79 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/include/common_device.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/include/common_device.hpp @@ -36,16 +36,16 @@ __forceinline__ __device__ __host__ T* load_ptr_batch(T *const p[], rocblas_int } template -__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) +__forceinline__ __global__ void get_array(T** out, T* in, 
rocblas_int stride, rocblas_int batch) { int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - + if (b < batch) out[b] = in + b*stride; } template -__forceinline__ __global__ void setdiag(const rocblas_int j, U A, +__forceinline__ __global__ void setdiag(const rocblas_int j, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, T *ipiv, const rocblas_int strideP) { @@ -54,7 +54,7 @@ __forceinline__ __global__ void setdiag(const rocblas_int j, U A, T *tau = ipiv + b*strideP; T t = -tau[j]; - tau[j] = t; + tau[j] = t; Ap[j + j*lda] = 1.0 + t; } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/include/ideal_sizes.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/include/ideal_sizes.hpp index 5d9cf574..260d9d1f 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/include/ideal_sizes.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/include/ideal_sizes.hpp @@ -8,7 +8,7 @@ // IDEAL SIZES ARE DEFINED FOR NOW AS IN CPU-LAPACK // BENCHMARKING OF ROCSOLVER WILL BE NEEDED TO DETERMINE -// MORE SUITABLE VALUES +// MORE SUITABLE VALUES diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/include/rocsolver_unique_ptr.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/include/rocsolver_unique_ptr.hpp index 185d1690..b7e34f6b 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/include/rocsolver_unique_ptr.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/include/rocsolver_unique_ptr.hpp @@ -1,24 +1,24 @@ -/* ************************************************************************ - * Copyright 2019-2020 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP -#define GUARD_ROCBLAS_MANAGE_PTR_HPP - -#include - -namespace rocsolver { -// device_malloc wraps hipMalloc and provides same API as malloc -static void *device_malloc(size_t byte_size) { - void *pointer; - PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); - return pointer; -} - -// device_free wraps hipFree and provides same API as free -static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } -} // namespace rocsolver - -using rocsolver_unique_ptr = std::unique_ptr; - -#endif +/* ************************************************************************ + * Copyright 2019-2020 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP +#define GUARD_ROCBLAS_MANAGE_PTR_HPP + +#include + +namespace rocsolver { +// device_malloc wraps hipMalloc and provides same API as malloc +static void *device_malloc(size_t byte_size) { + void *pointer; + PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); + return pointer; +} + +// device_free wraps hipFree and provides same API as free +static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } +} // namespace rocsolver + +using rocsolver_unique_ptr = std::unique_ptr; + +#endif diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.cpp index d412d69a..f5f6d466 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.hpp index 29c4266f..81ec19ae 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on row j @@ -45,18 +45,18 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int n - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(j,min(j+1,n-1),lda), //vector x to work on - lda, strideA, //inc of x + lda, strideA, //inc of x (ipiv + j), 
strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the right + // Apply Householder reflector to the rest of matrix from the right if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_batched.cpp index 027572df..35fe7af5 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_strided_batched.cpp index 9eefcb03..569facbb 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelq2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
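The gelq2 path above generates one Householder reflector per row (larfg) and applies it from the right to the rows below it (larf). As a compact host-side reference for what one such sweep computes, assuming real double data, naive loops, and a hypothetical function name rather than the library API, the unblocked LQ step can be written as:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Illustrative only: unblocked LQ of an m x n column-major matrix A (lda = m).
    // For each row j it builds a reflector from A(j, j:n-1) and applies it from
    // the right to the remaining rows, mirroring the gelq2 loop above.
    static void gelq2_reference(int m, int n, std::vector<double>& A, int lda,
                                std::vector<double>& tau)
    {
        auto at = [&](int i, int j) -> double& {
            return A[i + static_cast<std::size_t>(j) * lda];
        };
        const int dim = std::min(m, n);
        tau.assign(dim, 0.0);

        for (int j = 0; j < dim; ++j) {
            // larfg on the row: alpha = A(j,j), x = A(j, j+1:n-1)
            double alpha = at(j, j), xnorm = 0.0;
            for (int c = j + 1; c < n; ++c) xnorm += at(j, c) * at(j, c);
            xnorm = std::sqrt(xnorm);
            if (xnorm == 0.0) { tau[j] = 0.0; continue; }
            const double beta = -std::copysign(std::sqrt(alpha * alpha + xnorm * xnorm), alpha);
            tau[j] = (beta - alpha) / beta;
            for (int c = j + 1; c < n; ++c) at(j, c) /= (alpha - beta);
            at(j, j) = beta;

            // larf from the right on rows j+1..m-1, with v = [1, A(j, j+1:n-1)]
            for (int i = j + 1; i < m; ++i) {
                double w = at(i, j);                     // v[0] == 1
                for (int c = j + 1; c < n; ++c) w += at(i, c) * at(j, c);
                at(i, j) -= tau[j] * w;
                for (int c = j + 1; c < n; ++c) at(i, c) -= tau[j] * w * at(j, c);
            }
        }
    }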
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.cpp index a29c5b0f..f75a0da7 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
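For orientation while reading the gelq2/gelqf hunks around this point: a minimal host-side sketch of how the rocsolver_dgelqf entry point (whose wrapper appears in the next hunk) might be called on a single device-resident matrix. The helper name lq_factor_sketch, the buffer names dA/dIpiv, and the <rocsolver.h> include are illustrative assumptions, and all error checking is elided.

  #include <hip/hip_runtime.h>
  #include <rocsolver.h> // assumed public rocSOLVER header; adjust to the installed layout

  // Sketch only: LQ-factorize one m x n double matrix stored column-major on the GPU.
  // On return dA holds L plus the Householder vectors, and dIpiv holds the
  // min(m,n) Householder scalars (the "ipiv"/tau argument of the wrappers below).
  void lq_factor_sketch(rocblas_handle handle, rocblas_int m, rocblas_int n,
                        double* dA, rocblas_int lda, double* dIpiv)
  {
      rocblas_status st = rocsolver_dgelqf(handle, m, n, dA, lda, dIpiv);
      (void)st; // a real caller would verify st == rocblas_status_success
  }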
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.hpp index b0e15bef..d40b9dd5 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_gelq2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of rows in the block rocsolver_gelq2_template(handle, jb, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < m) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -76,9 +76,9 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_gelq2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_batched.cpp index 91631008..cee74932 100644 --- 
a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_strided_batched.cpp index 13e0312f..a5581819 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_gelqf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.cpp index 0cae47b0..249784a0 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.hpp index 668fc8a0..485550d7 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on column j @@ -45,18 +45,18 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(min(j+1,m-1),j,lda), //vector x to work on - 1, strideA, //inc of x + 1, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the left + // Apply Householder reflector to the rest of matrix from the left if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_batched.cpp index ef67a2eb..70e765e8 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas_int m, const 
rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_strided_batched.cpp index 26816634..e468de7e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqr2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.cpp index d941c762..b91aa412 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.hpp index fcdb4935..e1a3adaf 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_geqr2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of columns in the block rocsolver_geqr2_template(handle, m-j, jb, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < n) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -75,9 +75,9 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_geqr2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_batched.cpp index 3ae16e6a..41bb01e6 100644 --- 
a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_strided_batched.cpp index b3e3809d..bd670e1f 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_geqrf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
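As a hedged usage note for the geqrf wrappers shown above (the helper name and the buffers dA/dTau are placeholders, and no error handling is shown): the blocked QR entry point takes the same arguments as the unblocked geqr2 one, and the *_strided_batched variants only add strideA, the ipiv stride, and batch_count.

  #include <rocsolver.h> // assumed public rocSOLVER header

  // Sketch only: QR-factorize one m x n double matrix resident on the GPU.
  // dA is overwritten with R and the Householder vectors; the min(m,n)
  // Householder scalars are written to dTau (the "ipiv" parameter above).
  rocblas_status qr_factor_sketch(rocblas_handle handle, rocblas_int m, rocblas_int n,
                                  double* dA, rocblas_int lda, double* dTau)
  {
      return rocsolver_dgeqrf(handle, m, n, dA, lda, dTau);
  }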
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.cpp index 9b01a5af..d74da116 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int *ipiv, rocblas_int* info) -{ + rocblas_int *ipiv, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || lda < 1) @@ -41,25 +41,25 @@ rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.hpp index 727a76c3..5630004e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2.hpp @@ -44,14 +44,14 @@ inline __global__ void getf2_check_singularity(U AA, const rocblas_int shiftA, c template rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -69,7 +69,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipMemcpy(minoneInt, &minone, sizeof(T), hipMemcpyHostToDevice); //pivoting info in device (to avoid continuous synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -84,7 +84,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int //info=0 (starting with a nonsingular matrix) hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,info,batch_count,0); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** @@ -93,7 +93,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int // find pivot. 
Use Fortran 1-based indexing for the ipiv array as iamax does that as well! for (int b=0;b(AA,shiftA,b,strideA); - rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, + rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, (ipiv + shiftP + b*strideP + j)); } @@ -101,14 +101,14 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipLaunchKernelGGL(getf2_check_singularity, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, ipiv, shiftP, strideP, j, lda, pivotGPU, info); - // Swap pivot row and j-th row + // Swap pivot row and j-th row rocsolver_laswp_template(handle, n, A, shiftA, lda, strideA, j+1, j+1, ipiv, shiftP, strideP, 1, batch_count); // Compute elements J+1:M of J'th column for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (pivotGPU + b), - (M + idx2D(j + 1, j, lda)), oneInt); + rocblas_scal(handle, (m-j-1), (pivotGPU + b), + (M + idx2D(j + 1, j, lda)), oneInt); } // update trailing submatrix @@ -116,7 +116,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int for (int b=0;b(AA,shiftA,b,strideA); rocblas_ger(handle, m - j - 1, n - j - 1, minoneInt, - (M + idx2D(j + 1, j, lda)), oneInt, + (M + idx2D(j + 1, j, lda)), oneInt, (M + idx2D(j, j + 1, lda)), lda, (M + idx2D(j + 1, j + 1, lda)), lda); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_batched.cpp index bd9e7240..462e932d 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,25 +40,25 @@ rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_strided_batched.cpp index ccb2d252..b3ea05e9 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getf2_strided_batched.cpp @@ -7,19 +7,19 @@ template rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) return rocblas_status_invalid_size; - + return rocsolver_getf2_template(handle,m,n, A,0, //the matrix is shifted 0 entries (will work on the entire matrix) @@ -39,25 +39,25 @@ rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.cpp index 4a1c1b91..9b3bdf70 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, rocblas_int *ipiv, rocblas_int* info) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -40,25 +40,25 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.hpp index f19138bb..395fd187 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf.hpp @@ -41,13 +41,13 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int *info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) + if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) return rocsolver_getf2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, shiftP, strideP, info, batch_count); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. 
**** @@ -92,14 +92,14 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** for (int j = 0; j < dim; j += GETRF_GETF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GETRF_GETF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_getf2_template(handle, m - j, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, ipiv, shiftP + j, strideP, iinfo, batch_count); - + // adjust pivot indices and check singularity sizePivot = min(m - j, jb); //number of pivots in the block - blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; + blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; gridPivot = dim3(blocksPivot, batch_count, 1); hipLaunchKernelGGL(getrf_check_singularity, gridPivot, threads, 0, stream, sizePivot, j, ipiv, shiftP + j, strideP, iinfo, info); @@ -131,7 +131,7 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int (M + idx2D(j + jb, j + jb, lda)), lda); } } - } + } } hipFree(pivotGPU); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_batched.cpp index 5ed946d0..44317213 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m, - rocblas_int n, U A, rocblas_int lda, + rocblas_int n, U A, rocblas_int lda, rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -39,25 +39,25 @@ rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_strided_batched.cpp index c1ef590b..35443146 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrf_strided_batched.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -36,25 +36,25 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.cpp index 255e306c..435339c1 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, T *A, const rocblas_int lda, - const rocblas_int *ipiv, T *B, const rocblas_int ldb) + const rocblas_int *ipiv, T *B, const rocblas_int ldb) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? 
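To make the getrf/getrs pairing in these hunks concrete, here is a minimal sketch of factoring one n x n double system and solving it for a single right-hand side, using the signatures shown above. dA, dB, dIpiv and dInfo are assumed to be valid device allocations, and every status should be checked in real code.

  #include <rocsolver.h> // assumed public rocSOLVER header

  // Sketch only: solve A*x = b on the GPU by LU factorization with partial pivoting.
  // Step 1: rocsolver_dgetrf factors dA in place; pivots go to dIpiv, singularity flag to dInfo.
  // Step 2: rocsolver_dgetrs reuses the factors to solve for the right-hand side in dB
  //         (dB is overwritten with the solution x).
  void lu_solve_sketch(rocblas_handle handle, rocblas_int n, double* dA, rocblas_int lda,
                       rocblas_int* dIpiv, rocblas_int* dInfo, double* dB, rocblas_int ldb)
  {
      rocsolver_dgetrf(handle, n, n, dA, lda, dIpiv, dInfo);
      rocsolver_dgetrs(handle, rocblas_operation_none, n, 1, dA, lda, dIpiv, dB, ldb);
  }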
- if (n < 0 || nrhs < 0 || lda < n || ldb < n) + if (n < 0 || nrhs < 0 || lda < n || ldb < n) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -45,7 +45,7 @@ rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operati extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, - const rocblas_int *ipiv, float *B, const rocblas_int ldb) + const rocblas_int *ipiv, float *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } @@ -53,21 +53,21 @@ rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const roc extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, - const rocblas_int *ipiv, double *B, const rocblas_int ldb) + const rocblas_int *ipiv, double *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_float_complex *A, const rocsolver_int lda, - const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) + const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.hpp index 1209770f..e18816df 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs.hpp @@ -19,7 +19,7 @@ template rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int *ipiv, const rocblas_int strideP, U B, - const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { // quick return if (n == 0 || nrhs == 0 || batch_count == 0) { @@ -56,7 +56,7 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve L*X = B, overwriting B with X rocblas_trsm(handle, rocblas_side_left, rocblas_fill_lower, trans, rocblas_diagonal_unit, n, nrhs, @@ -67,13 +67,13 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope trans, rocblas_diagonal_non_unit, n, nrhs, oneInt, Ap, lda, Bp, ldb); } - + } else { for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve U**T *X = B or U**H *X = B, 
overwriting B with X rocblas_trsm(handle, rocblas_side_left, rocblas_fill_upper, trans, rocblas_diagonal_non_unit, n, nrhs, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_batched.cpp index dd2dbe6a..43d48ac5 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -44,7 +44,7 @@ rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } @@ -52,26 +52,26 @@ rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, c extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int 
*ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_strided_batched.cpp index 49ced525..e42302d3 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_getrs_strided_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -40,7 +40,7 @@ rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } @@ -48,26 +48,26 @@ rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return 
rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.cpp index 1ed3f0ee..0127cbe0 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.cpp @@ -5,14 +5,14 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.hpp index 4e1c3c91..518d202e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2.hpp @@ -18,9 +18,9 @@ #include "common_device.hpp" #include "ideal_sizes.hpp" -template -__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, - const rocblas_int j, T *res, rocblas_int *info) +template +__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, + const rocblas_int j, T *res, rocblas_int *info) { int id = hipBlockIdx_x; @@ -45,10 +45,10 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; #ifdef batched @@ -70,7 +70,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //diagonal info in device (device memory workspace to avoid synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -95,7 +95,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(0, j, lda)), 1, (M + idx2D(0, j, lda)), 1, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + 
hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -103,9 +103,9 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_transpose, j, n - j - 1, - d_minone, (M + idx2D(0, j + 1, lda)), lda, + d_minone, (M + idx2D(0, j + 1, lda)), lda, (M + idx2D(0, j, lda)), 1, d_one, (M + idx2D(j, j + 1, lda)), lda); - } + } for (int b=0;b(AA,shiftA,b,strideA); rocblas_scal(handle, n - j - 1, (pivotGPU + b), @@ -122,7 +122,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(j, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -130,7 +130,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_none, n - j - 1, j, - d_minone, (M + idx2D(j + 1, 0, lda)), lda, + d_minone, (M + idx2D(j + 1, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, d_one, (M + idx2D(j + 1, j, lda)), 1); } for (int b=0;b -rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2_strided_batched.cpp index 4988f364..4e88e448 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potf2_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.cpp index e0512eed..b8be605f 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.cpp @@ -5,14 +5,14 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.hpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.hpp index 1f1c6650..aef657d4 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.hpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf.hpp @@ -19,12 +19,12 @@ #include "ideal_sizes.hpp" #include "roclapack_potf2.hpp" -inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) +inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) { int id = hipBlockIdx_x; if (info[id] == 0 && iinfo[id] > 0) - info[id] = iinfo[id] + j; + info[id] = iinfo[id] + j; } template @@ -32,14 +32,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (n < POTRF_POTF2_SWITCHSIZE) + if (n < POTRF_POTF2_SWITCHSIZE) return rocsolver_potf2_template(handle, uplo, n, A, shiftA, lda, strideA, info, batch_count); #ifdef batched @@ -61,7 +61,7 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //info in device (device memory workspace to avoid synchronization with CPU) - rocblas_int *iinfo; + rocblas_int *iinfo; hipMalloc(&iinfo, sizeof(rocblas_int)*batch_count); hipStream_t stream; @@ -81,14 +81,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, if (uplo == rocblas_fill_upper) { // Compute the Cholesky factorization A = U'*U. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. 
hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_transpose, rocblas_operation_none, @@ -112,14 +112,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, } else { // Compute the Cholesky factorization A = L'*L. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_none, rocblas_operation_transpose, diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_batched.cpp index 7ac5061e..06dda30c 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_batched.cpp @@ -6,15 +6,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_strided_batched.cpp index 2e49ab4b..6c081fc4 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/lapack/roclapack_potrf_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/docs/library/src/rocsolver-config.cmake.in b/ROCm_Libraries/rocSOLVER/docs/library/src/rocsolver-config.cmake.in index 970adc43..8b6304e0 100644 --- a/ROCm_Libraries/rocSOLVER/docs/library/src/rocsolver-config.cmake.in +++ b/ROCm_Libraries/rocSOLVER/docs/library/src/rocsolver-config.cmake.in @@ -1,6 +1,6 @@ @PACKAGE_INIT@ - + set_and_check(rocsolver_INCLUDE_DIR @PACKAGE_INCLUDE_INSTALL_DIR@) set_and_check(rocsolver_INCLUDE_DIRS @PACKAGE_INCLUDE_INSTALL_DIR@) diff --git a/ROCm_Libraries/rocSOLVER/docs/source/api.rst b/ROCm_Libraries/rocSOLVER/docs/source/api.rst index 4068d267..690a60a8 100644 --- a/ROCm_Libraries/rocSOLVER/docs/source/api.rst +++ b/ROCm_Libraries/rocSOLVER/docs/source/api.rst @@ -1,12 +1,12 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* rocSOLVER API - (Documentation in progress....) ************* -This section provides details of the rocSOLVER library API as of Release +This section provides details of the rocSOLVER library API as of Release `ROCm 2.10 `_. @@ -14,7 +14,7 @@ This section provides details of the rocSOLVER library API as of Release Types ===== -Most rocSOLVER types are aliases of rocBLAS types. +Most rocSOLVER types are aliases of rocBLAS types. See rocBLAS types `here `_. Definitions @@ -312,7 +312,7 @@ rocsolver_getrs_strided_batched() Auxiliaries ========================= -rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions +rocSOLVER auxiliary functions are aliases of rocBLAS auxiliary functions. See rocBLAS auxiliary functions `here `_. rocSOLVER handle auxiliaries diff --git a/ROCm_Libraries/rocSOLVER/docs/source/index.rst b/ROCm_Libraries/rocSOLVER/docs/source/index.rst index 91296248..b586bf8e 100644 --- a/ROCm_Libraries/rocSOLVER/docs/source/index.rst +++ b/ROCm_Libraries/rocSOLVER/docs/source/index.rst @@ -4,9 +4,9 @@ Welcome to rocSOLVER's documentation! ======================================= .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: - library + library api diff --git a/ROCm_Libraries/rocSOLVER/docs/source/library.rst b/ROCm_Libraries/rocSOLVER/docs/source/library.rst index 7bbf839d..202bd844 100644 --- a/ROCm_Libraries/rocSOLVER/docs/source/library.rst +++ b/ROCm_Libraries/rocSOLVER/docs/source/library.rst @@ -1,30 +1,30 @@ .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: ************* Introduction ************* -An implementation of Lapack routines on top of AMD’s Radeon Open Compute Platform (ROCm) runtime and toolchains. -rocSOLVER is implemented in the HIP programming language; it is based on rocBLAS, an optimized BLAS -implementation for AMD’s latest discrete GPUs. More information about rocBLAS can be found +An implementation of Lapack routines on top of AMD's Radeon Open Compute Platform (ROCm) runtime and toolchains. +rocSOLVER is implemented in the HIP programming language; it is based on rocBLAS, an optimized BLAS +implementation for AMD's latest discrete GPUs. More information about rocBLAS can be found `here `_. Build and install =================== -rocSOLVER requires `cmake `_ -and `ROCm `_, including -`hip `_ and -`rocBLAS `_, to be installed. +rocSOLVER requires `cmake `_ +and `ROCm `_, including +`hip `_ and +`rocBLAS `_, to be installed. Once these requirements are satisfied, the following instructions will build and install rocSOLVER: .. 
code-block:: bash - + mkdir build && cd build CXX=/opt/rocm/bin/hcc cmake .. make @@ -33,56 +33,56 @@ instructions will build and install rocSOLVER: Brief description and functionality ==================================== -rocSolver Library is in early stages of active development. New features and functionality is being continuosly added. New -functionality is documented at each release of the ROCm platform. +rocSolver Library is in early stages of active development. New features and functionality is being continuosly added. New +functionality is documented at each release of the ROCm platform. The following table summarizes the LAPACK functionality implemented in rocSOLVER's last release. =============================== ====== ====== ============== ============== Lapack Auxiliary Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_laswp** x x x x -**rocsolver_larfg** x x +**rocsolver_laswp** x x x x +**rocsolver_larfg** x x **rocsolver_larft** x x **rocsolver_larf** x x -**rocsolver_larfb** x x -**rocsolver_org2r** x x -**rocsolver_orgqr** x x -**rocsolver_orgl2** x x -**rocsolver_orglq** x x -**rocsolver_orgbr** x x -**rocsolver_orm2r** x x -**rocsolver_ormqr** x x +**rocsolver_larfb** x x +**rocsolver_org2r** x x +**rocsolver_orgqr** x x +**rocsolver_orgl2** x x +**rocsolver_orglq** x x +**rocsolver_orgbr** x x +**rocsolver_orm2r** x x +**rocsolver_ormqr** x x =============================== ====== ====== ============== ============== =============================== ====== ====== ============== ============== Lapack Function single double single complex double complex =============================== ====== ====== ============== ============== -**rocsolver_potf2** x x -rocsolver_potf2_batched x x -rocsolver_potf2_strided_batched x x -**rocsolver_potrf** x x -rocsolver_potrf_batched x x -rocsolver_potrf_strided_batched x x +**rocsolver_potf2** x x +rocsolver_potf2_batched x x +rocsolver_potf2_strided_batched x x +**rocsolver_potrf** x x +rocsolver_potrf_batched x x +rocsolver_potrf_strided_batched x x **rocsolver_getf2** x x x x rocsolver_getf2_batched x x x x rocsolver_getf2_strided_batched x x x x -**rocsolver_getrf** x x x x +**rocsolver_getrf** x x x x rocsolver_getrf_batched x x x x rocsolver_getrf_strided_batched x x x x -**rocsolver_geqr2** x x +**rocsolver_geqr2** x x rocsolver_geqr2_batched x x rocsolver_geqr2_strided_batched x x -**rocsolver_geqrf** x x -rocsolver_geqrf_batched x x +**rocsolver_geqrf** x x +rocsolver_geqrf_batched x x rocsolver_geqrf_strided_batched x x -**rocsolver_gelq2** x x +**rocsolver_gelq2** x x rocsolver_gelq2_batched x x rocsolver_gelq2_strided_batched x x -**rocsolver_gelqf** x x -rocsolver_gelqf_batched x x +**rocsolver_gelqf** x x +rocsolver_gelqf_batched x x rocsolver_gelqf_strided_batched x x -**rocsolver_getrs** x x x x +**rocsolver_getrs** x x x x rocsolver_getrs_batched x x x x rocsolver_getrs_strided_batched x x x x =============================== ====== ====== ============== ============== @@ -90,30 +90,30 @@ rocsolver_getrs_strided_batched x x x x Benchmarking and testing ========================== -Additionally, rocSOLVER has a basic/preliminary infrastructure for testing and benchmarking similar to that of rocBLAS. +Additionally, rocSOLVER has a basic/preliminary infrastructure for testing and benchmarking similar to that of rocBLAS. -On a normal installation, clients should be located in the directory **/build/clients/staging**. 
+On a normal installation, clients should be located in the directory **/build/clients/staging**. **rocsolver-test** executes a suite of `Google tests `_ (*gtest*) that verifies the correct -functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by +functioning of the library; the results computed by rocSOLVER, for random input data, are compared with the results computed by `NETLib LAPACK `_ on the CPU. Calling the rocSOLVER gtest client with the --help flag .. code-block:: bash - + ./rocsolver-test --help -returns information on different flags that control the behavior of the gtests. +returns information on different flags that control the behavior of the gtests. **rocsolver-bench** allows to run any rocSOLVER function with random data of the specified dimensions; it compares the computed results, and provides basic -performance information (as for now, execution times). +performance information (as for now, execution times). -Similarly, +Similarly, .. code-block:: bash - + ./rocsolver-bench --help -returns information on how to use the rocSOLVER benchmark client. - +returns information on how to use the rocSOLVER benchmark client. + diff --git a/ROCm_Libraries/rocSOLVER/index.rst b/ROCm_Libraries/rocSOLVER/index.rst index 91296248..b586bf8e 100644 --- a/ROCm_Libraries/rocSOLVER/index.rst +++ b/ROCm_Libraries/rocSOLVER/index.rst @@ -4,9 +4,9 @@ Welcome to rocSOLVER's documentation! ======================================= .. toctree:: - :maxdepth: 4 + :maxdepth: 4 :caption: Contents: - library + library api diff --git a/ROCm_Libraries/rocSOLVER/library/include/rocsolver-functions.h b/ROCm_Libraries/rocSOLVER/library/include/rocsolver-functions.h index cd388512..3fbbfaf4 100644 --- a/ROCm_Libraries/rocSOLVER/library/include/rocsolver-functions.h +++ b/ROCm_Libraries/rocSOLVER/library/include/rocsolver-functions.h @@ -42,7 +42,7 @@ extern "C" { n rocsolver_int. n >= 0.\n The number of columns of the matrix A. @param[inout] - A pointer to type. Array on the GPU of dimension lda*n. \n + A pointer to type. Array on the GPU of dimension lda*n. \n On entry, the matrix of column dimension n to which the row interchanges will be applied. On exit, the permuted matrix. @param[in] @@ -59,7 +59,7 @@ extern "C" { @param[in] ipiv pointer to rocsolver_int. Array on the GPU of dimension at least k1 + (k2 - k1) * abs(incx).\n The vector of pivot indices. Only the elements in positions - k1 through (k1 + (k2 - k1) * abs(incx)) of IPIV are accessed. + k1 through (k1 + (k2 - k1) * abs(incx)) of IPIV are accessed. Elements of ipiv are considered 1-based. @param[in] incx rocsolver_int. incx != 0.\n @@ -67,92 +67,92 @@ extern "C" { is negative, the pivots are applied in reverse order. 
*************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slaswp(rocsolver_handle handle, const rocsolver_int n, - float *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + float *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlaswp(rocsolver_handle handle, const rocsolver_int n, - double *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + double *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_claswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + rocblas_float_complex *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -ROCSOLVER_EXPORT rocsolver_status rocsolver_zlaswp(rocsolver_handle handle, +ROCSOLVER_EXPORT rocsolver_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, - const rocsolver_int lda, - const rocsolver_int k1, - const rocsolver_int k2, - const rocsolver_int *ipiv, + rocblas_double_complex *A, + const rocsolver_int lda, + const rocsolver_int k1, + const rocsolver_int k2, + const rocsolver_int *ipiv, const rocsolver_int incx); -/*! \brief LARFG generates an orthogonal Householder reflector H of order n. +/*! \brief LARFG generates an orthogonal Householder reflector H of order n. \details Householder reflector H is such that - + H * [alpha] = [beta] [ x ] [ 0 ] - where x is an n-1 vector and alpha and beta are scalars. Matrix H can be + where x is an n-1 vector and alpha and beta are scalars. Matrix H can be generated as - + H = I - tau * [1] * [1 v'] [v] - with v an n-1 vector and tau a scalar. + with v an n-1 vector and tau a scalar. @param[in] handle rocsolver_handle @param[in] n rocsolver_int. n >= 0.\n - The order (size) of reflector H. + The order (size) of reflector H. @param[inout] alpha pointer to type. A scalar on the GPU.\n - On input the scalar alpha, + On input the scalar alpha, on output it is overwritten with beta. - @param[inout] + @param[inout] x pointer to type. Array on the GPU of size at least n-1.\n - On input it is the vector x, + On input it is the vector x, on output it is overwritten with vector v. @param[in] incx rocsolver_int. incx > 0.\n - The increment between consecutive elements of x. + The increment between consecutive elements of x. @param[out] tau pointer to type. A scalar on the GPU.\n The scalar tau. 
*************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfg(rocsolver_handle handle, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfg(rocsolver_handle handle, + const rocsolver_int n, float *alpha, - float *x, - const rocsolver_int incx, + float *x, + const rocsolver_int incx, float *tau); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, + const rocsolver_int n, double *alpha, - double *x, - const rocsolver_int incx, + double *x, + const rocsolver_int incx, double *tau); @@ -164,9 +164,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, H = H(1) * H(2) * ... * H(k) (forward direction), or H = H(k) * ... * H(2) * H(1) (backward direction) - depending on the value of direct. + depending on the value of direct. - The triangular matrix T is upper triangular in forward direction and lower triangular in backward direction. + The triangular matrix T is upper triangular in forward direction and lower triangular in backward direction. If storev is column-wise, then H = I - V * T * V' @@ -175,7 +175,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, H = I - V' * T * V - where the i-th row of matrix V contains the Householder vector associated to H(i). + where the i-th row of matrix V contains the Householder vector associated to H(i). @param[in] handle rocsolver_handle. @@ -188,10 +188,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n The order (size) of the block reflector. - @param[in] + @param[in] k rocsovler_int. k >= 1.\n The number of Householder matrices. - @param[in] + @param[in] V pointer to type. Array on the GPU of size ldv*k if column-wise, or ldv*n if row-wise.\n The matrix of Householder vectors. @param[in] @@ -203,44 +203,44 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfg(rocsolver_handle handle, @param[out] T pointer to type. Array on the GPU of dimension ldt*k.\n The triangular factor. T is upper triangular is forward operation, otherwise it is lower triangular. - The rest of the array is not used. - @param[in] + The rest of the array is not used. + @param[in] ldt rocsolver_int. ldt >= k.\n The leading dimension of T. - **************************************************************************/ + **************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_slarft(rocsolver_handle handle, - const rocsolver_direct direct, + const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, float *V, const rocsolver_int ldv, float *tau, - float *T, - const rocsolver_int ldt); + float *T, + const rocsolver_int ldt); ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, - const rocsolver_int n, + const rocsolver_storev storev, + const rocsolver_int n, const rocsolver_int k, double *V, const rocsolver_int ldv, double *tau, - double *T, - const rocsolver_int ldt); + double *T, + const rocsolver_int ldt); /*! \brief LARF applies a Householder reflector H to a general matrix A. \details The Householder reflector H, of order m (or n), is to be applied to a m-by-n matrix A - from the left (or the right). 
H is given by + from the left (or the right). H is given by H = I - alpha * x * x' - + where alpha is a scalar and x a Householder vector. H is never actually computed. @param[in] @@ -254,16 +254,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, Number of rows of A. @param[in] n rocsolver_int. n >= 0.\n - Number of columns of A. + Number of columns of A. @param[in] - x pointer to type. Array on the GPU of + x pointer to type. Array on the GPU of size at least (1 + (m-1)*abs(incx)) if left side, or at least (1 + (n-1)*abs(incx)) if right side.\n The Householder vector x. @param[in] incx rocsolver_int. incx != 0.\n - Increment between to consecutive elements of x. - If incx < 0, the elements of x are used in reverse order. + Increment between to consecutive elements of x. + If incx < 0, the elements of x are used in reverse order. @param[in] alpha pointer to type. A scalar on the GPU.\n If alpha = 0, then H = I (A will remain the same, x is never used) @@ -273,35 +273,35 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarft(rocsolver_handle handle, H*A (or A*H). @param[in] lda rocsolver_int. lda >= m.\n - Leading dimension of A. - + Leading dimension of A. + *************************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_slarf(rocsolver_handle handle, - const rocsolver_side side, +ROCSOLVER_EXPORT rocsolver_status rocsolver_slarf(rocsolver_handle handle, + const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, - float* x, - const rocsolver_int incx, + const rocsolver_int n, + float* x, + const rocsolver_int incx, const float* alpha, - float* A, + float* A, const rocsolver_int lda); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, - const rocsolver_side side, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, + const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, - double* x, - const rocsolver_int incx, + const rocsolver_int n, + double* x, + const rocsolver_int incx, const double* alpha, - double* A, + double* A, const rocsolver_int lda); /*! \brief LARFB applies a block reflector H to a general m-by-n matrix A. \details - The block reflector H is applied in one of the following forms, depending on + The block reflector H is applied in one of the following forms, depending on the values of side and trans: H * A (No transpose from the left) @@ -322,7 +322,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, H = I - V' * T * V - where the i-th row of matrix V contains the Householder vector associated to H(i), if storev is row-wise. + where the i-th row of matrix V contains the Householder vector associated to H(i), if storev is row-wise. T is the associated triangular factor as computed by LARFT. @param[in] @@ -345,11 +345,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix A. - @param[in] + @param[in] k rocsovler_int. k >= 1.\n The number of Householder matrices. - @param[in] - V pointer to type. Array on the GPU of size ldv*k if column-wise, ldv*n if row-wise and applying from the right, + @param[in] + V pointer to type. Array on the GPU of size ldv*k if column-wise, ldv*n if row-wise and applying from the right, or ldv*m if row-wise and applying from the left.\n The matrix of Householder vectors. 
@param[in] @@ -359,16 +359,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarf(rocsolver_handle handle, @param[in] T pointer to type. Array on the GPU of dimension ldt*k.\n The triangular factor of the block reflector. - @param[in] + @param[in] ldt rocsolver_int. ldt >= k.\n The leading dimension of T. @param[inout] A pointer to type. Array on the GPU of size lda*n.\n On input, the matrix A. On output it is overwritten with - H*A, A*H, H'*A, or A*H'. + H*A, A*H, H'*A, or A*H'. @param[in] lda rocsolver_int. lda >= m.\n - Leading dimension of A. + Leading dimension of A. ****************************************************************************/ @@ -376,31 +376,31 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_slarfb(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_direct direct, - const rocsolver_storev storev, + const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, float *V, const rocsolver_int ldv, - float *T, + float *T, const rocsolver_int ldt, float *A, - const rocsolver_int lda); + const rocsolver_int lda); ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_direct direct, - const rocsolver_storev storev, + const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, const rocsolver_int k, double *V, const rocsolver_int ldv, - double *T, + double *T, const rocsolver_int ldt, double *A, - const rocsolver_int lda); + const rocsolver_int lda); /*! \brief ORG2R generates a m-by-n Matrix Q with orthonormal columns. @@ -409,17 +409,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, The matrix Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEQRF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. 0 <= n <= m.\n The number of colums of the matrix Q. @@ -433,7 +433,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GEQRF. 
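The ORG2R/ORGQR descriptions above (and the LARFG section earlier in this header) repeatedly rely on the Householder reflector H = I - tau * [1; v] * [1; v]' that maps [alpha; x] to [beta; 0]. As a purely editorial aid, the following host-only sketch reproduces that construction with the conventional LAPACK formulas and verifies the mapping numerically; it calls no rocSOLVER API, and the helper name householder_ref is made up for illustration.

.. code-block:: cpp

   #include <cmath>
   #include <cstdio>
   #include <vector>

   // Reference generation of one reflector, mirroring the documented in/out
   // behaviour of LARFG: alpha is overwritten with beta, x with v, and tau is set.
   void householder_ref(double &alpha, std::vector<double> &x, double &tau) {
       double xnorm = 0.0;
       for (double xi : x) xnorm += xi * xi;
       xnorm = std::sqrt(xnorm);
       if (xnorm == 0.0) { tau = 0.0; return; }                 // H is the identity
       const double beta = -std::copysign(std::hypot(alpha, xnorm), alpha);
       tau = (beta - alpha) / beta;
       for (double &xi : x) xi /= (alpha - beta);               // v = x / (alpha - beta)
       alpha = beta;
   }

   int main() {
       double alpha = 3.0, tau = 0.0;
       std::vector<double> x = {1.0, -2.0, 2.0};                // entries below alpha
       const std::vector<double> x0 = x;
       const double alpha0 = alpha;
       householder_ref(alpha, x, tau);

       // Apply H = I - tau*u*u' with u = [1; v] to the original [alpha0; x0]:
       // the first entry should equal beta (now stored in alpha), the rest ~0.
       double udoty = alpha0;
       for (std::size_t i = 0; i < x.size(); ++i) udoty += x[i] * x0[i];
       std::printf("beta = %g, (H*y)[0] = %g\n", alpha, alpha0 - tau * udoty);
       for (std::size_t i = 0; i < x.size(); ++i)
           std::printf("(H*y)[%zu] = %g\n", i + 1, x0[i] - tau * udoty * x[i]);
       return 0;
   }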
@@ -442,16 +442,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dlarfb(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorg2r(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -463,17 +463,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, The matrix Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEQRF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. 0 <= n <= m.\n The number of colums of the matrix Q. @@ -487,7 +487,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GEQRF. @@ -496,16 +496,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorg2r(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgqr(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -517,17 +517,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, The matrix Q is defined as the first m rows of the product of k Householder reflectors of order n - + Q = H(k) * H(k-1) * ... * H(1) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GELQF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. 0 <= m <= n.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. n >= 0.\n The number of colums of the matrix Q. @@ -541,7 +541,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GELQF. 
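For the ORGQR generator declared in the preceding hunk, a minimal host-side call sequence looks roughly as follows. This is a sketch, not part of the patch: the header name rocsolver.h is assumed, the handle is created through the rocBLAS auxiliaries (the API notes above state that rocSOLVER types and handle auxiliaries are aliases of the rocBLAS ones), the device arrays are assumed to already hold the Householder vectors and scalar factors produced by the GEQRF routine referenced in the description, and error checking is omitted.

.. code-block:: cpp

   #include <hip/hip_runtime.h>
   #include <rocsolver.h>            // assumed installed header name
   #include <vector>

   int main() {
       const rocsolver_int m = 4, n = 3, k = 3, lda = m;

       rocsolver_handle handle;      // alias of rocblas_handle
       rocblas_create_handle(&handle);

       double *dA, *dTau;
       hipMalloc(reinterpret_cast<void**>(&dA),   sizeof(double) * lda * n);
       hipMalloc(reinterpret_cast<void**>(&dTau), sizeof(double) * k);
       // ... dA and dTau are assumed to already contain a QR factorization here ...

       // Overwrite dA with the m-by-n matrix Q = H(1) * H(2) * ... * H(k).
       rocsolver_dorgqr(handle, m, n, k, dA, lda, dTau);

       std::vector<double> Q(lda * n);
       hipMemcpy(Q.data(), dA, sizeof(double) * lda * n, hipMemcpyDeviceToHost);

       hipFree(dA);
       hipFree(dTau);
       rocblas_destroy_handle(handle);
       return 0;
   }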
@@ -550,16 +550,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgqr(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgl2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -572,17 +572,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, The matrix Q is defined as the first m rows of the product of k Householder reflectors of order n - + Q = H(k) * H(k-1) * ... * H(1) - Householder matrices H(i) are never stored, they are computed from its corresponding + Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GELQF. @param[in] handle rocsolver_handle. @param[in] m rocsolver_int. 0 <= m <= n.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. @param[in] n rocsolver_int. n >= 0.\n The number of colums of the matrix Q. @@ -596,7 +596,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension at least k.\n The scalar factors of the Householder matrices H(i) as returned by GELQF. @@ -605,16 +605,16 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgl2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorglq(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -622,9 +622,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, /*! \brief ORGBR generates a m-by-n Matrix Q with orthonormal rows or columns. \details - If storev is column-wise, then the matrix Q has orthonormal columns. If m >= k, Q is defined as the first + If storev is column-wise, then the matrix Q has orthonormal columns. If m >= k, Q is defined as the first n columns of the product of k Householder reflectors of order m - + Q = H(1) * H(2) * ... * H(k) If m < k, Q is defined as the product of Householder reflectors of order m @@ -635,12 +635,12 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, first m rows of the product of k Householder reflectors of order n Q = H(k) * H(k-1) * ... * H(1) - + If n <= k, Q is defined as the product of Householder reflectors of order n Q = H(n-1) * H(n-2) * ... * H(1) - The Householder matrices H(i) are never stored, they are computed from its corresponding + The Householder matrices H(i) are never stored, they are computed from its corresponding Householder vector v(i) and scalar ipiv_i as returned by GEBRD. 
@param[in] @@ -650,12 +650,12 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, Specifies whether to work column-wise or row-wise. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix Q. + The number of rows of the matrix Q. If row-wise, then min(n,k) <= m <= n. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix Q. - If column-wise, then min(m,k) <= n <= m. + The number of colums of the matrix Q. + If column-wise, then min(m,k) <= n <= m. @param[in] k rocsolver_int. k >= 0.\n The number of columns (if storev is colum-wise) or rows (if row-wise) of the @@ -667,7 +667,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, On exit, the computed matrix Q. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[in] ipiv pointer to type. Array on the GPU of dimension min(m,k) if column-wise, or min(n,k) if row-wise.\n The scalar factors of the Householder matrices H(i) as returned by GEBRD. @@ -677,8 +677,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorglq(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgbr(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv); @@ -686,8 +686,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sorgbr(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv); @@ -696,8 +696,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, \details (This is the unblocked version of the algorithm). - - The matrix Q is applied in one of the following forms, depending on + + The matrix Q is applied in one of the following forms, depending on the values of side and trans: Q * C (No transpose from the left) @@ -709,7 +709,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, Q = H(1) * H(2) * ... * H(k) - or order m if applying from the left, or n if applying from the right. Q is never stored, it is + or order m if applying from the left, or n if applying from the right. Q is never stored, it is calculated from the Householder vectors and scalars returned by the QR factorization GEQRF. @param[in] @@ -726,10 +726,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix C. - @param[in] + @param[in] k rocsovler_int. k >= 0; k <= m if side is left, k <= n if side is right.\n The number of Householder reflectors that form Q. - @param[in] + @param[in] A pointer to type. Array on the GPU of size lda*k.\n The i-th column has the Householder vector v(i) associated with H(i) as returned by GEQRF in the first k columns of its argument A. @@ -742,19 +742,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorgbr(rocsolver_handle handle, @param[inout] C pointer to type. Array on the GPU of size ldc*n.\n On input, the matrix C. On output it is overwritten with - Q*C, C*Q, Q'*C, or C*Q'. + Q*C, C*Q, Q'*C, or C*Q'. @param[in] lda rocsolver_int. ldc >= m.\n - Leading dimension of C. - + Leading dimension of C. 
+ ****************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sorm2r(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv, @@ -765,8 +765,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv, @@ -777,8 +777,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, \details (This is the blocked version of the algorithm). - - The matrix Q is applied in one of the following forms, depending on + + The matrix Q is applied in one of the following forms, depending on the values of side and trans: Q * C (No transpose from the left) @@ -790,7 +790,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, Q = H(1) * H(2) * ... * H(k) - or order m if applying from the left, or n if applying from the right. Q is never stored, it is + or order m if applying from the left, or n if applying from the right. Q is never stored, it is calculated from the Householder vectors and scalars returned by the QR factorization GEQRF. @param[in] @@ -807,10 +807,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, @param[in] n rocsolver_int. n >= 0.\n Number of columns of matrix C. - @param[in] + @param[in] k rocsovler_int. k >= 0; k <= m if side is left, k <= n if side is right.\n The number of Householder reflectors that form Q. - @param[in] + @param[in] A pointer to type. Array on the GPU of size lda*k.\n The i-th column has the Householder vector v(i) associated with H(i) as returned by GEQRF in the first k columns of its argument A. @@ -823,19 +823,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dorm2r(rocsolver_handle handle, @param[inout] C pointer to type. Array on the GPU of size ldc*n.\n On input, the matrix C. On output it is overwritten with - Q*C, C*Q, Q'*C, or C*Q'. + Q*C, C*Q, Q'*C, or C*Q'. @param[in] lda rocsolver_int. ldc >= m.\n - Leading dimension of C. - + Leading dimension of C. + ****************************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sormqr(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, float *A, const rocsolver_int lda, float *ipiv, @@ -846,8 +846,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, const rocsolver_int m, - const rocsolver_int n, - const rocsolver_int k, + const rocsolver_int n, + const rocsolver_int k, double *A, const rocsolver_int lda, double *ipiv, @@ -880,10 +880,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix A. + The number of rows of the matrix A. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix A. + The number of colums of the matrix A. @param[inout] A pointer to type. 
Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix A to be factored. @@ -891,7 +891,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, The unit diagonal elements of L are not stored. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to rocsolver_int. Array on the GPU of dimension min(m,n).\n The vector of pivot indices. Elements of ipiv are 1-based indices. @@ -900,14 +900,14 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dormqr(rocsolver_handle handle, Matrix P of the factorization can be derived from ipiv. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful exit. + If info = 0, succesful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot. - + ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -915,7 +915,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -923,7 +923,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -931,7 +931,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -968,8 +968,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, lda rocsolver_int. lda >= m.\n Specifies the leading dimension of matrices A_i. @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivot indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -981,17 +981,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2(rocsolver_handle handle, There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1001,7 +1001,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1011,7 +1011,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1021,7 +1021,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1034,7 +1034,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand \details (This is the right-looking Level 2 BLAS version of the algorithm). - + The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i @@ -1064,8 +1064,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivots indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivots indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1077,17 +1077,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_batched(rocsolver_handle hand There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1098,7 +1098,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1109,7 +1109,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1120,7 +1120,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1147,10 +1147,10 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han handle rocsolver_handle. @param[in] m rocsolver_int. m >= 0.\n - The number of rows of the matrix A. + The number of rows of the matrix A. @param[in] n rocsolver_int. n >= 0.\n - The number of colums of the matrix A. + The number of colums of the matrix A. @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix A to be factored. @@ -1158,7 +1158,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han The unit diagonal elements of L are not stored. @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to rocsolver_int. Array on the GPU of dimension min(m,n).\n The vector of pivot indices. Elements of ipiv are 1-based indices. @@ -1167,14 +1167,14 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetf2_strided_batched(rocsolver_han Matrix P of the factorization can be derived from ipiv. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful exit. + If info = 0, succesful exit. If info = i > 0, U is singular. U(i,i) is the first zero pivot. 
- + ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1182,7 +1182,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1190,7 +1190,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1198,7 +1198,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf(rocsolver_handle handle, ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, @@ -1235,8 +1235,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, lda rocsolver_int. lda >= m.\n Specifies the leading dimension of matrices A_i. @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivot indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivot indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1248,17 +1248,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf(rocsolver_handle handle, There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
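The "array of pointers to type" argument used by the *_batched routines can be unfamiliar, so a short setup sketch follows. It assumes the usual rocBLAS-style convention that the pointer array itself must reside in device memory; the helper name is hypothetical and cleanup is omitted.

#include <hip/hip_runtime.h>
#include <stdlib.h>

/* Sketch: one lda*n device buffer per matrix A_i, plus a device-resident
 * copy of the pointer array (assumed rocBLAS-style convention).           */
double **make_batch_pointers(int lda, int n, int batch_count, double ***hA_out)
{
    double **hA = (double **)malloc(sizeof(double *) * batch_count);
    for (int i = 0; i < batch_count; ++i)
        hipMalloc((void **)&hA[i], sizeof(double) * (size_t)lda * n);

    double **dA;
    hipMalloc((void **)&dA, sizeof(double *) * batch_count);
    hipMemcpy(dA, hA, sizeof(double *) * batch_count, hipMemcpyHostToDevice);

    *hA_out = hA;   /* keep the host copy so each A_i can be freed later */
    return dA;      /* pass this as the A argument of *_batched routines */
}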
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1268,7 +1268,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1278,7 +1278,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1288,7 +1288,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, @@ -1301,7 +1301,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand \details (This is the right-looking Level 3 BLAS version of the algorithm). - + The factorization of matrix A_i in the batch has the form A_i = P_i * L_i * U_i @@ -1331,8 +1331,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n @param[out] - ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors of pivots indices ipiv_i (corresponding to A_i). + ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n + Contains the vectors of pivots indices ipiv_i (corresponding to A_i). Dimension of ipiv_i is min(m,n). Elements of ipiv_i are 1-based indices. For each instance A_i in the batch and for 1 <= j <= min(m,n), the row j of the @@ -1344,17 +1344,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_batched(rocsolver_handle hand There is no restriction for the value of strideP. Normal use case is strideP >= min(m,n). @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful exit for factorization of A_i. + If info_i = 0, succesful exit for factorization of A_i. If info_i = j > 0, U_i is singular. U_i(j,j) is the first zero pivot. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. - + Number of matrices in the batch. 
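For the *_strided_batched variants, all matrices live in one contiguous allocation, and the strideA/strideP values described above determine where each instance starts. The following sketch (hypothetical helper, assumed rocsolver.h header) simply applies the "normal use case" values from the documentation.

#include <hip/hip_runtime.h>
#include <rocsolver.h>   /* assumed public header name */

/* Sketch: allocate strided-batched storage; matrix i starts at
 * dA + i*strideA and its pivots at dIpiv + i*strideP.                     */
void alloc_strided_batch(int m, int n, int lda, int batch_count,
                         double **dA, rocsolver_int **dIpiv,
                         rocsolver_int *strideA, rocsolver_int *strideP)
{
    *strideA = lda * n;              /* normal use case: strideA >= lda*n     */
    *strideP = (m < n) ? m : n;      /* normal use case: strideP >= min(m,n)  */
    hipMalloc((void **)dA,    sizeof(double)        * (size_t)(*strideA) * batch_count);
    hipMalloc((void **)dIpiv, sizeof(rocsolver_int) * (size_t)(*strideP) * batch_count);
}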
+ ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1365,7 +1365,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_sgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1376,7 +1376,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1387,7 +1387,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, + const rocsolver_int n, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -1406,7 +1406,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han A = Q * [ R ] [ 0 ] - where R is upper triangular (upper trapezoidal if m < n), and Q is + where R is upper triangular (upper trapezoidal if m < n), and Q is a m-by-m orthogonal matrix represented as the product of Householder matrices Q = H(1) * H(2) * ... * H(k), with k = min(m,n) @@ -1414,8 +1414,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i) * v(i)' - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1428,30 +1428,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrf_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the m - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). 
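Because R and the Householder vectors share the output array A, unpacking them on the host is a common follow-up step. The sketch below is illustrative only (not a library routine) and follows the storage scheme documented above: R on and above the diagonal, and the trailing m - i elements of each v(i) below it, with v(i)[i] = 1 implied.

#include <string.h>

/* Host-side sketch: split the factored column-major m-by-n array A
 * (leading dimension lda, copied back from the GPU) into R and the
 * Householder vectors V, with k = min(m,n).                               */
void unpack_qr(int m, int n, int lda, const double *A,
               double *R /* m-by-n */, double *V /* m-by-k */)
{
    int k = (m < n) ? m : n;
    memset(R, 0, sizeof(double) * (size_t)m * n);
    memset(V, 0, sizeof(double) * (size_t)m * k);
    for (int j = 0; j < n; ++j)
        for (int i = 0; i <= j && i < m; ++i)
            R[i + j * m] = A[i + j * lda];      /* on/above diagonal: R    */
    for (int j = 0; j < k; ++j) {
        V[j + j * m] = 1.0;                     /* v(j+1)[j+1] = 1          */
        for (int i = j + 1; i < m; ++i)
            V[i + j * m] = A[i + j * lda];      /* below diagonal: v(j+1)   */
    }
}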
********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GEQR2_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1464,7 +1464,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, A_j = Q_j * [ R_j ] [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1473,7 +1473,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1486,19 +1486,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1507,22 +1507,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GEQR2_STRIDED_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1533,9 +1533,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1544,7 +1544,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1557,23 +1557,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1582,24 +1582,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqr2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQ2 computes a LQ factorization of a general m-by-n matrix A. @@ -1610,8 +1610,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han The factorization has the form A = [ L 0 ] * Q - - where L is lower triangular (lower trapezoidal if m > n), and Q is + + where L is lower triangular (lower trapezoidal if m > n), and Q is a n-by-n orthogonal matrix represented as the product of Householder matrices Q = H(k) * H(k-1) * ... * H(1), with k = min(m,n) @@ -1619,8 +1619,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i)' * v(i) - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1633,30 +1633,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqr2_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and delow the diagonal contain the + On exit, the elements on and delow the diagonal contain the factor L; the elements above the diagonal are the n - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). 
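A short call sketch for the unblocked LQ factorization may be useful here. It uses the GELQ2 parameter list shown below (handle, m, n, A, lda, ipiv); the helper name and the rocsolver.h header name are assumptions, and status checking is omitted.

#include <hip/hip_runtime.h>
#include <rocsolver.h>   /* assumed public header name */

/* Sketch: LQ-factor a column-major m-by-n matrix already resident on the GPU. */
void lq_factor(rocsolver_handle handle, rocsolver_int m, rocsolver_int n,
               double *dA, rocsolver_int lda)
{
    rocsolver_int k = (m < n) ? m : n;
    double *dTau;                    /* scalar factors ipiv, length min(m,n) */
    hipMalloc((void **)&dTau, sizeof(double) * k);

    /* On exit dA holds L on/below the diagonal and the Householder
     * vectors v(i) above it, as described in the GELQ2 documentation.      */
    rocsolver_dgelq2(handle, m, n, dA, lda, dTau);

    hipFree(dTau);
}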
********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GELQ2_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -1666,9 +1666,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -1677,7 +1677,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1690,19 +1690,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1711,22 +1711,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQ2_STRIDED_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -1736,9 +1736,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -1747,7 +1747,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1760,23 +1760,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). 
@param[in] @@ -1785,24 +1785,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelq2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); @@ -1815,8 +1815,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han A = Q * [ R ] [ 0 ] - - where R is upper triangular (upper trapezoidal if m < n), and Q is + + where R is upper triangular (upper trapezoidal if m < n), and Q is a m-by-m orthogonal matrix represented as the product of Householder matrices Q = H(1) * H(2) * ... * H(k), with k = min(m,n) @@ -1824,8 +1824,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i) * v(i)' - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1838,30 +1838,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelq2_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R; the elements below the diagonal are the m - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! 
\brief GEQRF_BATCHED computes the QR factorization of a batch of general m-by-n matrices. @@ -1872,9 +1872,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1883,7 +1883,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1896,19 +1896,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -1917,22 +1917,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GEQRF_STRIDED_BATCHED computes the QR factorization of a batch of general m-by-n matrices. 
@@ -1943,9 +1943,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form A_j = Q_j * [ R_j ] - [ 0 ] + [ 0 ] - where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is + where R_j is upper triangular (upper trapezoidal if m < n), and Q_j is a m-by-m orthogonal matrix represented as the product of Householder matrices Q_j = H_j(1) * H_j(2) * ... * H_j(k), with k = min(m,n) @@ -1954,7 +1954,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i) * v_j(i)' - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -1967,23 +1967,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and above the diagonal contain the + On exit, the elements on and above the diagonal contain the factor R_j. The elements below the diagonal are the m - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -1992,24 +1992,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgeqrf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! 
\brief GELQF computes a LQ factorization of a general m-by-n matrix A. @@ -2020,8 +2020,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han The factorization has the form A = [ L 0 ] * Q - - where L is lower triangular (lower trapezoidal if m > n), and Q is + + where L is lower triangular (lower trapezoidal if m > n), and Q is a n-by-n orthogonal matrix represented as the product of Householder matrices Q = H(k) * H(k-1) * ... * H(1), with k = min(m,n) @@ -2029,8 +2029,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han Each Householder matrix H(i), for i = 1,2,...,k, is given by H(i) = I - ipiv[i-1] * v(i)' * v(i) - - where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. + + where the first i-1 elements of the Householder vector v(i) are zero, and v(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2043,30 +2043,30 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgeqrf_strided_batched(rocsolver_han @param[inout] A pointer to type. Array on the GPU of dimension lda*n.\n On entry, the m-by-n matrix to be factored. - On exit, the elements on and delow the diagonal contain the + On exit, the elements on and delow the diagonal contain the factor L; the elements above the diagonal are the n - i elements of vector v(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of A. + Specifies the leading dimension of A. @param[out] ipiv pointer to type. Array on the GPU of dimension min(m,n).\n The scalar factors of the Householder matrices H(i). ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, + const rocsolver_int lda, float *ipiv); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, + const rocsolver_int lda, double *ipiv); /*! \brief GELQF_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -2076,9 +2076,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -2087,7 +2087,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2100,19 +2100,19 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, @param[inout] A Array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n On entry, the m-by-n matrices A_j to be factored. 
- On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i=1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. + Specifies the leading dimension of matrices A_j. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -2121,22 +2121,22 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf(rocsolver_handle handle, ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *const A[], - const rocsolver_int lda, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *const A[], - const rocsolver_int lda, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GELQF_STRIDED_BATCHED computes the LQ factorization of a batch of general m-by-n matrices. @@ -2146,9 +2146,9 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand The factorization of matrix A_j in the batch has the form - A_j = [ L_j 0 ] * Q_j + A_j = [ L_j 0 ] * Q_j - where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is + where L_j is lower triangular (lower trapezoidal if m > n), and Q_j is a n-by-n orthogonal matrix represented as the product of Householder matrices Q_j = H_j(k) * H_j(k-1) * ... * H_j(1), with k = min(m,n) @@ -2157,7 +2157,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand H_j(i) = I - ipiv_j[i-1] * v_j(i)' * v_j(i) - where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. + where the first i-1 elements of vector Householder vector v_j(i) are zero, and v_j(i)[i] = 1. @param[in] handle rocsolver_handle. @@ -2170,23 +2170,23 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n On entry, the m-by-n matrices A_j to be factored. - On exit, the elements on and below the diagonal contain the + On exit, the elements on and below the diagonal contain the factor L_j. The elements above the diagonal are the n - i elements of vector v_j(i) for i = 1,2,...,min(m,n). @param[in] lda rocsolver_int. lda >= m.\n - Specifies the leading dimension of matrices A_j. 
+ Specifies the leading dimension of matrices A_j. @param[in] - strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + strideA rocsolver_int.\n + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] ipiv pointer to type. Array on the GPU (the size depends on the value of strideP).\n - Contains the vectors ipiv_j of scalar factors of the + Contains the vectors ipiv_j of scalar factors of the Householder matrices H_j(i). @param[in] strideP rocsolver_int.\n - Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). + Stride from the start of one vector ipiv_j to the next one ipiv_(j+1). There is no restriction for the value of strideP. Normal use is strideP >= min(m,n). @param[in] @@ -2195,46 +2195,46 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_batched(rocsolver_handle hand ********************************************************************/ -ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_sgelqf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, float *A, - const rocsolver_int lda, - const rocsolver_int strideA, - float *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + float *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); -ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_handle handle, - const rocsolver_int m, - const rocsolver_int n, +ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_handle handle, + const rocsolver_int m, + const rocsolver_int n, double *A, - const rocsolver_int lda, - const rocsolver_int strideA, - double *ipiv, - const rocsolver_int strideP, + const rocsolver_int lda, + const rocsolver_int strideA, + double *ipiv, + const rocsolver_int strideP, const rocsolver_int batch_count); /*! \brief GETRS solves a system of n linear equations on n variables using the LU factorization computed by GETRF. \details - It solves one of the following systems: + It solves one of the following systems: - A * X = B (no transpose), - A' * X = B (transpose), or + A * X = B (no transpose), + A' * X = B (transpose), or A* * X = B (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations. + Specifies the form of the system of equations. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of A. + The order of the system, i.e. the number of columns and rows of A. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2244,7 +2244,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgelqf_strided_batched(rocsolver_han The factors L and U of the factorization A = P*L*U returned by GETRF. @param[in] lda rocsolver_int. lda >= n.\n - The leading dimension of A. + The leading dimension of A. @param[in] ipiv pointer to rocsolver_int. Array on the GPU of dimension n.\n The pivot indices returned by GETRF. 
@@ -2278,26 +2278,26 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int *ipiv, rocblas_double_complex *B, const rocsolver_int ldb); -/*! \brief GETRS_BATCHED solves a batch of systems of n linear equations on n variables +/*! \brief GETRS_BATCHED solves a batch of systems of n linear equations on n variables using the LU factorization computed by GETRF_BATCHED. \details - For each instance j in the batch, it solves one of the following systems: + For each instance j in the batch, it solves one of the following systems: - A_j * X_j = B_j (no transpose), - A_j' * X_j = B_j (transpose), or + A_j * X_j = B_j (no transpose), + A_j' * X_j = B_j (transpose), or A_j* * X_j = B_j (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations of each instance in the batch. + Specifies the form of the system of equations of each instance in the batch. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of all A_j matrices. + The order of the system, i.e. the number of columns and rows of all A_j matrices. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2312,7 +2312,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n Contains the vectors ipiv_j of pivot indices returned by GETRF_BATCHED. @param[in,out] - B Array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*nrhs.\n + B Array of pointers to type. Each pointer points to an array on the GPU of dimension ldb*nrhs.\n On entry, the right hand side matrices B_j. On exit, the solution matrix X_j of each system in the batch. @param[in] @@ -2320,7 +2320,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( The leading dimension of matrices B_j. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of instances (systems) in the batch. + Number of instances (systems) in the batch. ********************************************************************/ @@ -2337,35 +2337,35 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs_batched( ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count); ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count); -/*! \brief GETRS_STRIDED_BATCHED solves a batch of systems of n linear equations on n variables +/*! 
\brief GETRS_STRIDED_BATCHED solves a batch of systems of n linear equations on n variables using the LU factorization computed by GETRF_STRIDED_BATCHED. \details - For each instance j in the batch, it solves one of the following systems: + For each instance j in the batch, it solves one of the following systems: - A_j * X_j = B_j (no transpose), - A_j' * X_j = B_j (transpose), or + A_j * X_j = B_j (no transpose), + A_j' * X_j = B_j (transpose), or A_j* * X_j = B_j (conjugate transpose) - depending on the value of trans. + depending on the value of trans. @param[in] handle rocsolver_handle. @param[in] trans rocsolver_operation.\n - Specifies the form of the system of equations of each instance in the batch. + Specifies the form of the system of equations of each instance in the batch. @param[in] n rocsolver_int. n >= 0.\n - The order of the system, i.e. the number of columns and rows of all A_j matrices. + The order of the system, i.e. the number of columns and rows of all A_j matrices. @param[in] nrhs rocsolver_int. nrhs >= 0.\n The number of right hand sides, i.e., the number of columns @@ -2378,7 +2378,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( The leading dimension of matrices A_j. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_j and the next one A_(j+1). + Stride from the start of one matrix A_j and the next one A_(j+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[in] ipiv pointer to rocsolver_int. Array on the GPU (the size depends on the value of strideP).\n @@ -2392,11 +2392,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( The leading dimension of matrices B_j. @param[in] strideB rocsolver_int.\n - Stride from the start of one matrix B_j and the next one B_(j+1). + Stride from the start of one matrix B_j and the next one B_(j+1). There is no restriction for the value of strideB. Normal use case is strideB >= ldb*nrhs. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of instances (systems) in the batch. + Number of instances (systems) in the batch. ********************************************************************/ @@ -2413,13 +2413,13 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dgetrs_strided_batched( ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count); ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count); @@ -2427,7 +2427,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( positive definite matrix A. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). 
The factorization has the form: @@ -2453,8 +2453,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( specifies the leading dimension of A. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful factorization of matrix A. - If info = i > 0, the leading minor of order i of A is not positive definite. + If info = 0, succesful factorization of matrix A. + If info = i > 0, the leading minor of order i of A is not positive definite. The factorization stopped at this point. ********************************************************************/ @@ -2472,11 +2472,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2(rocsolver_handle handle, rocblas_int* info); -/*! \brief POTF2_BATCHED computes the Cholesky factorization of a +/*! \brief POTF2_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2496,24 +2496,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2(rocsolver_handle handle, The dimension of matrix A_i. @param[inout] A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, succesful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocblas_int* info, @@ -2521,17 +2521,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocblas_int* info, const rocsolver_int batch_count); -/*! \brief POTF2_STRIDED_BATCHED computes the Cholesky factorization of a +/*! \brief POTF2_STRIDED_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the unblocked version of the algorithm). + (This is the unblocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2551,28 +2551,28 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_batched(rocsolver_handle hand The dimension of matrix A_i. @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. 
On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_i and the next one A_(i+1). + Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, succesful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2581,7 +2581,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotf2_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2592,7 +2592,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_han positive definite matrix A. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization has the form: @@ -2618,8 +2618,8 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotf2_strided_batched(rocsolver_han specifies the leading dimension of A. @param[out] info pointer to a rocsolver_int on the GPU.\n - If info = 0, succesful factorization of matrix A. - If info = i > 0, the leading minor of order i of A is not positive definite. + If info = 0, succesful factorization of matrix A. + If info = i > 0, the leading minor of order i of A is not positive definite. The factorization stopped at this point. ********************************************************************/ @@ -2637,11 +2637,11 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf(rocsolver_handle handle, rocblas_int* info); -/*! \brief POTRF_BATCHED computes the Cholesky factorization of a +/*! \brief POTRF_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2661,24 +2661,24 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf(rocsolver_handle handle, The dimension of matrix A_i. @param[inout] A array of pointers to type. Each pointer points to an array on the GPU of dimension lda*n.\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. 
- If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, succesful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. ********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *const A[], const rocsolver_int lda, rocblas_int* info, @@ -2686,17 +2686,17 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_batched(rocsolver_handle hand ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *const A[], const rocsolver_int lda, rocblas_int* info, const rocsolver_int batch_count); -/*! \brief POTRF_STRIDED_BATCHED computes the Cholesky factorization of a +/*! \brief POTRF_STRIDED_BATCHED computes the Cholesky factorization of a batch of real symmetric positive definite matrices. \details - (This is the blocked version of the algorithm). + (This is the blocked version of the algorithm). The factorization of matrix A_i in the batch has the form: @@ -2716,28 +2716,28 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_batched(rocsolver_handle hand The dimension of matrix A_i. @param[inout] A pointer to type. Array on the GPU (the size depends on the value of strideA).\n - On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. + On entry, the matrices A_i to be factored. On exit, the upper or lower triangular factors. @param[in] lda rocsolver_int. lda >= n.\n specifies the leading dimension of A_i. @param[in] strideA rocsolver_int.\n - Stride from the start of one matrix A_i and the next one A_(i+1). + Stride from the start of one matrix A_i and the next one A_(i+1). There is no restriction for the value of strideA. Normal use case is strideA >= lda*n. @param[out] info pointer to rocsolver_int. Array of batch_count integers on the GPU.\n - If info_i = 0, succesful factorization of matrix A_i. - If info_i = j > 0, the leading minor of order j of A_i is not positive definite. + If info_i = 0, succesful factorization of matrix A_i. + If info_i = j > 0, the leading minor of order j of A_i is not positive definite. The i-th factorization stopped at this point. @param[in] batch_count rocsolver_int. batch_count >= 0.\n - Number of matrices in the batch. + Number of matrices in the batch. 
********************************************************************/ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, float *A, const rocsolver_int lda, const rocsolver_int strideA, @@ -2746,7 +2746,7 @@ ROCSOLVER_EXPORT rocsolver_status rocsolver_spotrf_strided_batched(rocsolver_han ROCSOLVER_EXPORT rocsolver_status rocsolver_dpotrf_strided_batched(rocsolver_handle handle, const rocsolver_fill uplo, - const rocsolver_int n, + const rocsolver_int n, double *A, const rocsolver_int lda, const rocsolver_int strideA, diff --git a/ROCm_Libraries/rocSOLVER/library/include/rocsolver-types.h b/ROCm_Libraries/rocSOLVER/library/include/rocsolver-types.h index 55d3e42a..e8cf8251 100644 --- a/ROCm_Libraries/rocSOLVER/library/include/rocsolver-types.h +++ b/ROCm_Libraries/rocSOLVER/library/include/rocsolver-types.h @@ -11,8 +11,8 @@ #include -/*! \brief Used to specify int32 or int64. - \details rocsolver_int is a rocblas_int +/*! \brief Used to specify int32 or int64. + \details rocsolver_int is a rocblas_int ******************************************************************/ typedef rocblas_int rocsolver_int; @@ -20,12 +20,12 @@ typedef rocblas_float_complex rocsolver_float_complex; typedef rocblas_double_complex rocsolver_double_complex; typedef rocblas_half rocsolver_half; -/*! \brief A structure holding the rocsolver library context. - \details +/*! \brief A structure holding the rocsolver library context. + \details It must be initialized using rocsolver_create_handle() - and the returned handle must be passed to all subsequent library + and the returned handle must be passed to all subsequent library function calls. It should be destroyed at the end using rocsolver_destroy_handle().\n - rocsolver_handle is a rocblas_handle. + rocsolver_handle is a rocblas_handle. *************************************************************************/ typedef rocblas_handle rocsolver_handle; @@ -56,16 +56,16 @@ typedef rocblas_status rocsolver_status; typedef rocblas_layer_mode rocsolver_layer_mode; -/*! \brief Used to specify the order in which multiple elementary matrices are applied together - ********************************************************************************/ +/*! \brief Used to specify the order in which multiple elementary matrices are applied together + ********************************************************************************/ typedef enum rocsolver_direct_ { rocsolver_forward_direction = 171, /**< Elementary matrices applied from the right. */ rocsolver_backward_direction = 172, /**< Elementary matrices applied from the left. */ } rocsolver_direct; -/*! \brief Used to specify how householder vectors are stored in a matrix of vectors - ********************************************************************************/ +/*! \brief Used to specify how householder vectors are stored in a matrix of vectors + ********************************************************************************/ typedef enum rocsolver_storev_ { rocsolver_column_wise = 181, /**< Householder vectors are stored in the columns of a matrix. 
*/ diff --git a/ROCm_Libraries/rocSOLVER/library/src/CMakeLists.txt b/ROCm_Libraries/rocSOLVER/library/src/CMakeLists.txt index cbf3d10d..4a435950 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/CMakeLists.txt +++ b/ROCm_Libraries/rocSOLVER/library/src/CMakeLists.txt @@ -82,7 +82,7 @@ add_library( rocsolver ${rocsolver_lapack_source} ${relative_rocsolver_headers_public} ${rocsolver_auxiliary_source} - ${rocsolver_common_source} + ${rocsolver_common_source} ) add_library( roc::rocsolver ALIAS rocsolver ) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.cpp index 9c52fd62..8c4e0c70 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_larf.hpp" template -rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, T* x, const rocsolver_int incx, const T* alpha, T* A, const rocsolver_int lda) { @@ -24,7 +24,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side rocblas_int stridep = 0; rocblas_int batch_count=1; - return rocsolver_larf_template(handle,side, + return rocsolver_larf_template(handle,side, m,n, x,0, //vector shifted 0 entries incx, @@ -33,7 +33,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side stridep, A,0, //matrix shifted 0 entries lda, - stridea, + stridea, batch_count); } @@ -46,14 +46,14 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side extern "C" { -ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, float* x, const rocsolver_int incx, const float* alpha, float* A, const rocsolver_int lda) { return rocsolver_larf_impl(handle, side, m, n, x, incx, alpha, A, lda); } -ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, double* x, const rocsolver_int incx, const double* alpha, double* A, const rocsolver_int lda) { diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.hpp index 27a5a0d4..3755ea14 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larf.hpp @@ -19,8 +19,8 @@ template rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, - const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, - const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, + const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, + const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, const rocsolver_int lda, const rocblas_int stridea, const rocblas_int batch_count) { // 
quick return @@ -40,7 +40,7 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ T* zeroInt; //constant 0 in device hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -66,16 +66,16 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ // OF A AND X, AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + //memory in GPU (workspace) T *workvec; hipMalloc(&workvec, sizeof(T)*order*batch_count); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute the matrix vector product (W=tau*A'*X or W=tau*A*X) for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.cpp index 12ed4e92..d28b4a03 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.cpp @@ -5,10 +5,10 @@ #include "rocauxiliary_larfb.hpp" template -rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* F, const rocsolver_int ldf, T* A, const rocsolver_int lda) { @@ -22,7 +22,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid if (storev == rocsolver_row_wise) { if (ldv < k) return rocblas_status_invalid_size; - } else { + } else { if ((side == rocblas_side_left && ldv < m) || (side == rocblas_side_right && ldv < n)) return rocblas_status_invalid_size; } @@ -34,7 +34,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid rocblas_int stridef = 0; rocblas_int batch_count=1; - return rocsolver_larfb_template(handle,side,trans,direct,storev, + return rocsolver_larfb_template(handle,side,trans,direct,storev, m,n,k, V,0, //shifted 0 entries ldv, @@ -44,7 +44,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid stridef, A,0, //shifted 0 entries lda, - stridea, + stridea, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.hpp index 5214e29a..dc4ee469 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfb.hpp @@ -19,7 +19,7 @@ template -__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ 
-38,7 +38,7 @@ __global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U } template -__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -52,18 +52,18 @@ __global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A Wp = work + b*strideW; Ap = load_ptr_batch(A,shiftA,b,strideA); - Ap[i + j*lda] -= Wp[i + j*ldw]; + Ap[i + j*lda] -= Wp[i + j*ldw]; } } template -rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, T *F, const rocsolver_int shiftF, - const rocsolver_int ldf, const rocsolver_int strideF, + const rocsolver_int ldf, const rocsolver_int strideF, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, const rocsolver_int batch_count) { @@ -100,14 +100,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver //determine the side, size of workspace //and whether V is trapezoidal - rocsolver_operation transp; + rocsolver_operation transp; rocsolver_fill uploV; bool trap; rocblas_int order, ldw; - bool colwise = (storev == rocsolver_column_wise); + bool colwise = (storev == rocsolver_column_wise); bool leftside = (side == rocblas_side_left); size_t offsetV; - + if (leftside) { order = n; ldw = k; @@ -120,16 +120,16 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver if (colwise) { uploV = rocblas_fill_lower; offsetV = idx2D(k,0,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_transpose; - else + else transp = rocblas_operation_none; } else { uploV = rocblas_fill_upper; offsetV = idx2D(0,k,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_none; - else + else transp = rocblas_operation_transpose; } @@ -146,15 +146,15 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver rocblas_int blocksx = (order - 1)/32 + 1; rocblas_int blocksy = (ldw - 1)/32 + 1; hipLaunchKernelGGL(copymatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + // BACKWARD DIRECTION TO BE IMPLEMENTED... 
rocsolver_fill uploT = rocblas_fill_upper; if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - + //compute: // V1' * A1, or - // or + // or // A1 * V1 for (int b=0;b(VV,shiftV,b,strideV); @@ -162,14 +162,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } // compute: - // V1' * A1 + V2' * A2 - // or + // V1' * A1 + V2' * A2 + // or // A1 * V1 + A2 * V2 - if (trap) { + if (trap) { for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,ldw,order,m-k,oneInt, (Vp + offsetV),ldv, (Ap + idx2D(k,0,lda)),lda, @@ -183,10 +183,10 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } - // compute: + // compute: // trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) + // (A1 * V1 + A2 * V2) * trans(T) for (int b=0;b(FF,shiftF,b,strideF); rocblas_trmm(handle,side,uploT,trans,rocblas_diagonal_non_unit,ldw,order,oneInt,Fp,ldf,(work + b*strideW),ldw); @@ -195,7 +195,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver // compute: // A2 - V2 * trans(T) * (V1' * A1 + V2' * A2) // or - // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' + // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' if (transp == rocblas_operation_transpose) transp = rocblas_operation_none; else @@ -205,7 +205,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,m-k,order,ldw,minoneInt, (Vp + offsetV),ldv, (work + b*strideW),ldw, @@ -218,22 +218,22 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } } - + // compute: // V1 * trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) * V1' + // (A1 * V1 + A2 * V2) * trans(T) * V1' for (int b=0;b(VV,shiftV,b,strideV); rocblas_trmm(handle,side,uploV,transp,rocblas_diagonal_unit,ldw,order,oneInt,Vp,ldv,(work + b*strideW),ldw); } - + // compute: // A1 - V1 * trans(T) * (V1' * A1 + V2' * A2) // or // A1 - (A1 * V1 + A2 * V2) * trans(T) * V1' hipLaunchKernelGGL(addmatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + hipFree(minoneInt); hipFree(oneInt); hipFree(work); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.cpp index 4b1e00fa..8e651066 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.cpp @@ -26,7 +26,7 @@ rocblas_status rocsolver_larfg_impl(rocblas_handle handle, const rocblas_int n, incx, stridex, tau, - strideP, + strideP, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.hpp index f4fc193c..38683f5d 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larfg.hpp @@ -42,7 +42,7 @@ __global__ void set_taubeta(T *tau, const rocblas_int strideP, T *norms, U alpha template -rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int shifta, +rocblas_status 
rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int shifta, U x, const rocblas_int shiftx, const rocblas_int incx, const rocblas_int stridex, T *tau, const rocblas_int strideP, const rocblas_int batch_count) { @@ -54,11 +54,11 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int hipStream_t stream; rocblas_get_stream(handle, &stream); dim3 gridReset(1, batch_count, 1); - dim3 threads(1, 1, 1); + dim3 threads(1, 1, 1); if (n == 1) { hipLaunchKernelGGL(reset_batch_info,gridReset,threads,0,stream,tau,strideP,1,0); - return rocblas_status_success; - } + return rocblas_status_success; + } T *xp; @@ -73,12 +73,12 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *norms; - hipMalloc(&norms, sizeof(T)*batch_count); + hipMalloc(&norms, sizeof(T)*batch_count); // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute norm of x for (int b=0;b(xx,shiftx,b,stridex); @@ -87,9 +87,9 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //set value of tau and beta and scalling factor for vector x //alpha <- beta - //norms <- scalling + //norms <- scalling hipLaunchKernelGGL(set_taubeta,dim3(batch_count),dim3(1),0,stream,tau,strideP,norms,alpha,shifta,stridex); - + //compute vector v=x*norms for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.cpp index 5ab79a92..10915015 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_larft.hpp" template -rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, const rocsolver_int n, +rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* tau, T* F, const rocsolver_int ldf) { @@ -38,7 +38,7 @@ rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_dir stridet, F, ldf, - stridef, + stridef, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.hpp index ee2add09..8a38ac3f 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_larft.hpp @@ -17,8 +17,8 @@ #include "common_device.hpp" template -__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, - T* tau, const rocsolver_int strideT, +__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, + T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_storev storev) { const auto blocksize = hipBlockDim_x; @@ -51,20 +51,20 @@ __global__ void set_tau(const rocsolver_int k, T* tau, const rocsolver_int strid const auto blocksize = hipBlockDim_x; const auto b = hipBlockIdx_x; const auto i = hipBlockIdx_y 
* blocksize + hipThreadIdx_x; - + if (i < k) { T *tp; tp = tau + b*strideT; tp[i] = -tp[i]; } } - + template -rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, +rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, - const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_int batch_count) { // quick return @@ -84,7 +84,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipMemcpy(oneInt, &one, sizeof(T), hipMemcpyHostToDevice); hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -98,26 +98,26 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - //Fix diagonal of T, make zero the non used triangular part, + //Fix diagonal of T, make zero the non used triangular part, //setup tau (changing signs) and account for the non-stored 1's on the householder vectors rocblas_int blocks = (k - 1)/32 + 1; hipLaunchKernelGGL(set_triangular,dim3(blocks,blocks,batch_count),dim3(32,32),0,stream, k,V,shiftV,ldv,strideV,tau,strideT,F,ldf,strideF,storev); hipLaunchKernelGGL(set_tau,dim3(batch_count,blocks),dim3(32,1),0,stream,k,tau,strideT); - // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS + // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS // AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - - rocblas_operation trans; - - for (int i = 1; i < k; ++i) { + rocblas_operation trans; + + + for (int i = 1; i < k; ++i) { //compute the matrix vector product, using the householder vectors for (int b=0;b(VV,shiftV,b,strideV); Fp = F + b*strideF; - rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, + rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, (Fp + idx2D(0,i,ldf)), 1, zeroInt, (Fp + idx2D(0,i,ldf)), 1); - } + } } //restore tau @@ -151,7 +151,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipFree(oneInt); hipFree(zeroInt); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.cpp index e79f652f..360fef79 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.cpp @@ -54,14 +54,14 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dlaswp(rocsolver_handle handle, const } ROCSOLVER_EXPORT rocblas_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); } ROCSOLVER_EXPORT rocblas_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.hpp index 0dc74205..4615a7ec 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_laswp.hpp @@ -51,10 +51,10 @@ __global__ void laswp_kernel(const rocblas_int n, U AA, const rocblas_int shiftA template rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int k1, const rocblas_int k2, - const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, + const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, const rocblas_int batch_count) { // quick return - if (n == 0 || !batch_count) + if (n == 0 || !batch_count) return rocblas_status_success; rocblas_int start, end, inc; @@ -63,7 +63,7 @@ rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int end = k1 - 1; inc = -1; incx = -incx; - } + } else { start = k1; end = k2 + 1; diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.cpp index 102fd83e..465b3635 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.cpp @@ -5,7 +5,7 @@ #include 
"rocauxiliary_org2r.hpp" template -rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.hpp index 08d072aa..2dbcc11e 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_org2r.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j > i) + else if (j > i) Ap[i + j*lda] = 0.0; else if (j >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j,j+1,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th column -corresponding to H(i)- if (j < m - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), - (M + idx2D(j + 1, j, lda)), 1); - } + rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), + (M + idx2D(j + 1, j, lda)), 1); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.cpp index bd3e4714..eb4f0bb6 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orgbr.hpp" template -rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.hpp index a1315b6e..deec30a8 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgbr.hpp @@ -23,7 +23,7 @@ #define BS 32 //blocksize for kernels template -__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -33,17 +33,17 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && j <= i) { rocblas_int offset = j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy columns - Wp[i + j*ldw - offset] = (j == 0 ? 0.0 : Ap[i+1 + (j-1)*lda]); - + Wp[i + j*ldw - offset] = (j == 0 ? 
0.0 : Ap[i+1 + (j-1)*lda]); + } else { - // shift columns to the right + // shift columns to the right Ap[i+1 + j*lda] = Wp[i + j*ldw - offset]; - + // make first row the identity if (i == j) { Ap[(j+1)*lda] = 0.0; @@ -55,7 +55,7 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const } template -__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -65,17 +65,17 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && i <= j) { rocblas_int offset = j*ldw - j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy rows - Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); - + Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); + } else { - // shift rows downward + // shift rows downward Ap[i + (j+1)*lda] = Wp[i + j*ldw - offset]; - + // make first column the identity if (i == j) { Ap[i+1] = 0.0; @@ -87,9 +87,9 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const } template -rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -99,11 +99,11 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization + // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization // of a m-by-k matrix A (given by gebrd) if (storev == rocsolver_column_wise) { if (m >= k) { - rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); + rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); } else { // shift the householder vectors provided by gebrd as they come below the first subdiagonal // workspace @@ -115,21 +115,21 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (m - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); - + 
hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + // result - rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); + hipFree(W); - } + } } - - // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization + + // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization // of a k-by-n matrix A (given by gebrd) else { if (n > k) { @@ -145,19 +145,19 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (n - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // result rocsolver_orglq_template(handle, n-1, n-1, n-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + hipFree(W); } - } + } return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.cpp index 27e3d8ed..ec38dc16 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgl2.hpp" template -rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.hpp index 202a4fc3..35475070 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgl2.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j < i) + else if (j < i) Ap[i + j*lda] = 0.0; else if (i >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return 
@@ -51,7 +51,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j+1,j,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th row -corresponding to H(i)- if (j < n - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), - (M + idx2D(j, j + 1, lda)), lda); - } + rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), + (M + idx2D(j, j + 1, lda)), lda); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.cpp index 35b17482..e3039734 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orglq.hpp" template -rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.hpp index 97886fce..39f77a46 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orglq.hpp @@ -32,16 +32,16 @@ __global__ void set_zero_row(const rocblas_int m, const rocblas_int kk, U A, if (i < m && j < kk) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // 
quick return @@ -50,9 +50,9 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_orgl2_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -64,34 +64,34 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding left submatrix if (kk < m) { blocksx = (m - kk - 1)/32 + 1; blocksy = (kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, m,kk,A,shiftA,lda,strideA); - - rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < m) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -110,13 +110,13 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_orgl2_template(handle, jb, n - j, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_orgl2_template(handle, jb, n - j, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.cpp index ef11bd5e..7b1aceec 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgqr.hpp" template -rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.hpp index 86386317..8079413c 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orgqr.hpp @@ -32,15 +32,15 @@ __global__ void set_zero_col(const rocblas_int n, const rocblas_int kk, U A, if (i < kk && j < n) { T *Ap = 
load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -49,9 +49,9 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_org2r_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -63,34 +63,34 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding top submatrix if (kk < n) { blocksx = (kk - 1)/32 + 1; blocksy = (n- kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, n,kk,A,shiftA,lda,strideA); - - rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < n) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -109,13 +109,13 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_org2r_template(handle, m - j, jb, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_org2r_template(handle, m - j, jb, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.cpp index 34ee185b..fdaa1724 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orm2r.hpp" template -rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, 
const rocsolver_int n, +rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.hpp index 10522f08..dd83c375 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_orm2r.hpp @@ -18,10 +18,10 @@ #include "../auxiliary/rocauxiliary_larf.hpp" template -rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -72,14 +72,14 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver ncol = n - i; jc = i; } - - // insert one in A(i,i) tobuild/apply the householder matrix + + // insert one in A(i,i) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); - // Apply current Householder reflector + // Apply current Householder reflector rocsolver_larf_template(handle,side, //side nrow, //number of rows of matrix to modify - ncol, //number of columns of matrix to modify + ncol, //number of columns of matrix to modify A, shiftA + idx2D(i,i,lda), //householder vector x 1, strideA, //inc of x (ipiv + i), strideP, //householder scalar (alpha) @@ -90,7 +90,7 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver // restore original value of A(i,i) hipLaunchKernelGGL(restore_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.cpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.cpp index 7d11d5e6..820f4a46 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_ormqr.hpp" template -rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status 
rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.hpp b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.hpp index fd0b523c..b24d77cd 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/auxiliary/rocauxiliary_ormqr.hpp @@ -20,10 +20,10 @@ #include "../auxiliary/rocauxiliary_larft.hpp" template -rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -35,14 +35,14 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked variant of the algorithm - if (k <= ORMQR_ORM2R_BLOCKSIZE) + if (k <= ORMQR_ORM2R_BLOCKSIZE) return rocsolver_orm2r_template(handle, side, trans, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, C, shiftC, ldc, strideC, batch_count); //memory in GPU (workspace) T* work; rocblas_int ldw = ORMQR_ORM2R_BLOCKSIZE; rocblas_int strideW = ldw *ldw; - hipMalloc(&work, sizeof(T)*strideW*batch_count); + hipMalloc(&work, sizeof(T)*strideW*batch_count); // determine limits and indices bool left = (side == rocblas_side_left); @@ -100,7 +100,7 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver C, shiftC + idx2D(ic,jc,ldc),ldc,strideC, batch_count); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/common/rocblas.cpp b/ROCm_Libraries/rocSOLVER/library/src/common/rocblas.cpp index 2d57c7d9..65dd0697 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/common/rocblas.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/common/rocblas.cpp @@ -104,7 +104,7 @@ rocblas_status rocblas_iamax(rocblas_handle handle, rocblas_int n, return rocblas_izamax(handle, n, x, incx, result); } -//ger +//ger template <> rocblas_status rocblas_ger(rocblas_handle handle, rocblas_int m, rocblas_int n, diff --git a/ROCm_Libraries/rocSOLVER/library/src/include/common_device.hpp b/ROCm_Libraries/rocSOLVER/library/src/include/common_device.hpp index 1aaaab61..d28acb79 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/include/common_device.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/include/common_device.hpp @@ -36,16 +36,16 @@ __forceinline__ __device__ __host__ T* load_ptr_batch(T *const p[], rocblas_int } template -__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) +__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) { int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - + if (b < batch) out[b] = in + b*stride; } template -__forceinline__ __global__ void setdiag(const rocblas_int j, U A, +__forceinline__ __global__ void 
setdiag(const rocblas_int j, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, T *ipiv, const rocblas_int strideP) { @@ -54,7 +54,7 @@ __forceinline__ __global__ void setdiag(const rocblas_int j, U A, T *tau = ipiv + b*strideP; T t = -tau[j]; - tau[j] = t; + tau[j] = t; Ap[j + j*lda] = 1.0 + t; } diff --git a/ROCm_Libraries/rocSOLVER/library/src/include/ideal_sizes.hpp b/ROCm_Libraries/rocSOLVER/library/src/include/ideal_sizes.hpp index 5d9cf574..260d9d1f 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/include/ideal_sizes.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/include/ideal_sizes.hpp @@ -8,7 +8,7 @@ // IDEAL SIZES ARE DEFINED FOR NOW AS IN CPU-LAPACK // BENCHMARKING OF ROCSOLVER WILL BE NEEDED TO DETERMINE -// MORE SUITABLE VALUES +// MORE SUITABLE VALUES diff --git a/ROCm_Libraries/rocSOLVER/library/src/include/rocsolver_unique_ptr.hpp b/ROCm_Libraries/rocSOLVER/library/src/include/rocsolver_unique_ptr.hpp index 185d1690..b7e34f6b 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/include/rocsolver_unique_ptr.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/include/rocsolver_unique_ptr.hpp @@ -1,24 +1,24 @@ -/* ************************************************************************ - * Copyright 2019-2020 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP -#define GUARD_ROCBLAS_MANAGE_PTR_HPP - -#include - -namespace rocsolver { -// device_malloc wraps hipMalloc and provides same API as malloc -static void *device_malloc(size_t byte_size) { - void *pointer; - PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); - return pointer; -} - -// device_free wraps hipFree and provides same API as free -static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } -} // namespace rocsolver - -using rocsolver_unique_ptr = std::unique_ptr; - -#endif +/* ************************************************************************ + * Copyright 2019-2020 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP +#define GUARD_ROCBLAS_MANAGE_PTR_HPP + +#include + +namespace rocsolver { +// device_malloc wraps hipMalloc and provides same API as malloc +static void *device_malloc(size_t byte_size) { + void *pointer; + PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); + return pointer; +} + +// device_free wraps hipFree and provides same API as free +static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } +} // namespace rocsolver + +using rocsolver_unique_ptr = std::unique_ptr; + +#endif diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.cpp index d412d69a..f5f6d466 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
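The rocsolver_unique_ptr.hpp header rewritten above wraps hipMalloc and hipFree in malloc/free-style helpers so device buffers can be owned by smart pointers. A minimal sketch of that RAII pattern, using only the HIP runtime API and standard C++; the deleter and helper names below are illustrative and are not the library's exact definitions:

    #include <hip/hip_runtime.h>
    #include <memory>

    // Custom deleter so std::unique_ptr releases device memory with hipFree.
    struct hip_deleter {
        void operator()(void* p) const { (void)hipFree(p); }
    };

    using device_ptr = std::unique_ptr<void, hip_deleter>;

    // Allocate byte_size bytes on the device and hand ownership to a unique_ptr.
    inline device_ptr make_device_buffer(size_t byte_size) {
        void* raw = nullptr;
        if (hipMalloc(&raw, byte_size) != hipSuccess)
            return nullptr;               // caller checks for allocation failure
        return device_ptr(raw);
    }

    int main() {
        // 256 doubles of scratch space; freed automatically when `work` goes out of scope.
        device_ptr work = make_device_buffer(256 * sizeof(double));
        return work ? 0 : 1;
    }

Owning workspace allocations this way means scratch buffers such as the diag and work arrays in the templates above cannot leak on an early return.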
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.hpp index 29c4266f..81ec19ae 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on row j @@ -45,18 +45,18 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int n - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(j,min(j+1,n-1),lda), //vector x to work on - lda, strideA, //inc of x + lda, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the right + // Apply Householder reflector to the rest of matrix from the right if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_batched.cpp index 027572df..35fe7af5 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - 
T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_strided_batched.cpp index 9eefcb03..569facbb 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelq2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.cpp index a29c5b0f..f75a0da7 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.hpp index b0e15bef..d40b9dd5 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_gelq2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of rows in the block rocsolver_gelq2_template(handle, jb, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < m) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -76,9 +76,9 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_gelq2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_batched.cpp index 91631008..cee74932 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_batched.cpp +++ 
b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_strided_batched.cpp index 13e0312f..a5581819 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_gelqf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.cpp index 0cae47b0..249784a0 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.hpp index 668fc8a0..485550d7 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on column j @@ -45,18 +45,18 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(min(j+1,m-1),j,lda), //vector x to work on - 1, strideA, //inc of x + 1, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the left + // Apply Householder reflector to the rest of matrix from the left if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_batched.cpp index ef67a2eb..70e765e8 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* 
ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_strided_batched.cpp index 26816634..e468de7e 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqr2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
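The geqr2 template above is the unblocked Householder QR: larfg generates one reflector per column and larf applies it to the trailing columns, with set_one_diag/restore_diag temporarily placing a unit on the diagonal so the stored vector can be used in place. As a reference for what one batch instance computes, here is a plain host-side sketch of the same loop for a real, column-major matrix; it is illustrative only and follows the usual LAPACK storage conventions rather than the device code above:

    #include <cmath>
    #include <algorithm>

    // Unblocked Householder QR of an m x n column-major matrix A (leading dimension lda).
    // On return, R sits on and above the diagonal, the Householder vectors v_j (with an
    // implicit leading 1) sit below it, and tau[j] holds the reflector scalar, as in GEQR2.
    void geqr2_reference(int m, int n, double* A, int lda, double* tau) {
        int dim = std::min(m, n);
        for (int j = 0; j < dim; ++j) {
            // Generate H_j = I - tau_j * v * v^T that zeroes A(j+1:m-1, j).
            double alpha = A[j + j * lda];
            double xnorm = 0.0;
            for (int i = j + 1; i < m; ++i) xnorm += A[i + j * lda] * A[i + j * lda];
            if (xnorm == 0.0) { tau[j] = 0.0; continue; }          // column already reduced
            double beta = -std::copysign(std::sqrt(alpha * alpha + xnorm), alpha);
            tau[j] = (beta - alpha) / beta;
            double scal = 1.0 / (alpha - beta);
            for (int i = j + 1; i < m; ++i) A[i + j * lda] *= scal; // store v below the diagonal
            A[j + j * lda] = beta;                                  // R(j,j)

            // Apply H_j to the trailing columns: y -= tau_j * v * (v^T y).
            for (int k = j + 1; k < n; ++k) {
                double dot = A[j + k * lda];                        // v_0 is the implicit 1
                for (int i = j + 1; i < m; ++i) dot += A[i + j * lda] * A[i + k * lda];
                dot *= tau[j];
                A[j + k * lda] -= dot;
                for (int i = j + 1; i < m; ++i) A[i + k * lda] -= dot * A[i + j * lda];
            }
        }
    }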
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.cpp index d941c762..b91aa412 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp index fcdb4935..e1a3adaf 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_geqr2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of columns in the block rocsolver_geqr2_template(handle, m-j, jb, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < n) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -75,9 +75,9 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_geqr2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_batched.cpp index 3ae16e6a..41bb01e6 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_batched.cpp +++ 
b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_strided_batched.cpp index b3e3809d..bd670e1f 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_geqrf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
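The geqrf template above falls back to the unblocked kernel for small sizes and otherwise factors panel blocks, building the triangular factor with larft and applying the block reflector with larfb. From the host the factorization is reached through the exported C wrappers shown earlier, such as rocsolver_dgeqrf. A minimal usage sketch follows; the include path and the use of rocblas_create_handle are assumptions about the build environment, the matrix contents are arbitrary, and error checking is omitted:

    #include <hip/hip_runtime.h>
    #include <rocsolver.h>   // header name assumed; adjust to your installation
    #include <algorithm>
    #include <vector>

    int main() {
        const rocblas_int m = 4, n = 3, lda = m;

        // Column-major input matrix (contents are arbitrary example data).
        std::vector<double> hA(lda * n, 1.0);
        hA[0] = 4.0; hA[5] = 3.0; hA[10] = 2.0;

        rocblas_handle handle;
        rocblas_create_handle(&handle);

        double *dA, *dTau;
        hipMalloc(&dA, sizeof(double) * lda * n);
        hipMalloc(&dTau, sizeof(double) * std::min(m, n));  // Householder scalars
        hipMemcpy(dA, hA.data(), sizeof(double) * lda * n, hipMemcpyHostToDevice);

        // QR factorization: on return dA holds R on and above the diagonal and the
        // Householder vectors below it; dTau holds the reflector scalars.
        rocsolver_dgeqrf(handle, m, n, dA, lda, dTau);

        hipMemcpy(hA.data(), dA, sizeof(double) * lda * n, hipMemcpyDeviceToHost);

        hipFree(dTau);
        hipFree(dA);
        rocblas_destroy_handle(handle);
        return 0;
    }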
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.cpp index 9b01a5af..d74da116 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int *ipiv, rocblas_int* info) -{ + rocblas_int *ipiv, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || lda < 1) @@ -41,25 +41,25 @@ rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.hpp index 727a76c3..5630004e 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2.hpp @@ -44,14 +44,14 @@ inline __global__ void getf2_check_singularity(U AA, const rocblas_int shiftA, c template rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -69,7 +69,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipMemcpy(minoneInt, &minone, sizeof(T), hipMemcpyHostToDevice); //pivoting info in device (to avoid continuous synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -84,7 +84,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int //info=0 (starting with a nonsingular matrix) hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,info,batch_count,0); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** @@ -93,7 +93,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int // find pivot. 
Use Fortran 1-based indexing for the ipiv array as iamax does that as well! for (int b=0;b(AA,shiftA,b,strideA); - rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, + rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, (ipiv + shiftP + b*strideP + j)); } @@ -101,14 +101,14 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipLaunchKernelGGL(getf2_check_singularity, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, ipiv, shiftP, strideP, j, lda, pivotGPU, info); - // Swap pivot row and j-th row + // Swap pivot row and j-th row rocsolver_laswp_template(handle, n, A, shiftA, lda, strideA, j+1, j+1, ipiv, shiftP, strideP, 1, batch_count); // Compute elements J+1:M of J'th column for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (pivotGPU + b), - (M + idx2D(j + 1, j, lda)), oneInt); + rocblas_scal(handle, (m-j-1), (pivotGPU + b), + (M + idx2D(j + 1, j, lda)), oneInt); } // update trailing submatrix @@ -116,7 +116,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int for (int b=0;b(AA,shiftA,b,strideA); rocblas_ger(handle, m - j - 1, n - j - 1, minoneInt, - (M + idx2D(j + 1, j, lda)), oneInt, + (M + idx2D(j + 1, j, lda)), oneInt, (M + idx2D(j, j + 1, lda)), lda, (M + idx2D(j + 1, j + 1, lda)), lda); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_batched.cpp index bd9e7240..462e932d 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
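The getf2 template above is the textbook unblocked LU factorization with partial pivoting: iamax locates the pivot in the current column, laswp swaps the rows, scal divides the subdiagonal entries by the pivot, and ger applies the rank-1 update to the trailing submatrix. As a reference for what one batch instance computes, here is a plain host-side sketch of the same sequence (column-major storage, 1-based pivot indices as in LAPACK); it is illustrative, not the library code:

    #include <cmath>
    #include <algorithm>

    // Factor A (m x n, column-major, leading dimension lda) in place as P*A = L*U.
    // ipiv[j] records the 1-based row chosen as pivot at step j, matching LAPACK.
    // Returns 0 on success, or j+1 if the pivot at step j is exactly zero.
    int getf2_reference(int m, int n, double* A, int lda, int* ipiv) {
        int dim = std::min(m, n);
        for (int j = 0; j < dim; ++j) {
            // Find the pivot: largest-magnitude entry in column j, rows j..m-1.
            int p = j;
            for (int i = j + 1; i < m; ++i)
                if (std::fabs(A[i + j * lda]) > std::fabs(A[p + j * lda])) p = i;
            ipiv[j] = p + 1;                       // 1-based, as iamax/laswp use above
            if (A[p + j * lda] == 0.0) return j + 1;

            // Swap the pivot row and row j across all n columns.
            if (p != j)
                for (int k = 0; k < n; ++k) std::swap(A[j + k * lda], A[p + k * lda]);

            // Scale the subdiagonal of column j by 1/pivot (the L multipliers).
            double inv = 1.0 / A[j + j * lda];
            for (int i = j + 1; i < m; ++i) A[i + j * lda] *= inv;

            // Rank-1 update of the trailing submatrix: A22 -= l21 * u12.
            for (int k = j + 1; k < n; ++k)
                for (int i = j + 1; i < m; ++i)
                    A[i + k * lda] -= A[i + j * lda] * A[j + k * lda];
        }
        return 0;
    }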
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,25 +40,25 @@ rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_strided_batched.cpp index ccb2d252..b3ea05e9 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getf2_strided_batched.cpp @@ -7,19 +7,19 @@ template rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) return rocblas_status_invalid_size; - + return rocsolver_getf2_template(handle,m,n, A,0, //the matrix is shifted 0 entries (will work on the entire matrix) @@ -39,25 +39,25 @@ rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.cpp index 4a1c1b91..9b3bdf70 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, rocblas_int *ipiv, rocblas_int* info) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -40,25 +40,25 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.hpp index f19138bb..395fd187 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf.hpp @@ -41,13 +41,13 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int *info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) + if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) return rocsolver_getf2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, shiftP, strideP, info, batch_count); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. 
**** @@ -92,14 +92,14 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** for (int j = 0; j < dim; j += GETRF_GETF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GETRF_GETF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_getf2_template(handle, m - j, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, ipiv, shiftP + j, strideP, iinfo, batch_count); - + // adjust pivot indices and check singularity sizePivot = min(m - j, jb); //number of pivots in the block - blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; + blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; gridPivot = dim3(blocksPivot, batch_count, 1); hipLaunchKernelGGL(getrf_check_singularity, gridPivot, threads, 0, stream, sizePivot, j, ipiv, shiftP + j, strideP, iinfo, info); @@ -131,7 +131,7 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int (M + idx2D(j + jb, j + jb, lda)), lda); } } - } + } } hipFree(pivotGPU); diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_batched.cpp index 5ed946d0..44317213 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m, - rocblas_int n, U A, rocblas_int lda, + rocblas_int n, U A, rocblas_int lda, rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -39,25 +39,25 @@ rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_strided_batched.cpp index c1ef590b..35443146 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrf_strided_batched.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -36,25 +36,25 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.cpp index 255e306c..435339c1 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, T *A, const rocblas_int lda, - const rocblas_int *ipiv, T *B, const rocblas_int ldb) + const rocblas_int *ipiv, T *B, const rocblas_int ldb) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? 
- if (n < 0 || nrhs < 0 || lda < n || ldb < n) + if (n < 0 || nrhs < 0 || lda < n || ldb < n) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -45,7 +45,7 @@ rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operati extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, - const rocblas_int *ipiv, float *B, const rocblas_int ldb) + const rocblas_int *ipiv, float *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } @@ -53,21 +53,21 @@ rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const roc extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, - const rocblas_int *ipiv, double *B, const rocblas_int ldb) + const rocblas_int *ipiv, double *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_float_complex *A, const rocsolver_int lda, - const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) + const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.hpp index 1209770f..e18816df 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs.hpp @@ -19,7 +19,7 @@ template rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int *ipiv, const rocblas_int strideP, U B, - const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { // quick return if (n == 0 || nrhs == 0 || batch_count == 0) { @@ -56,7 +56,7 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve L*X = B, overwriting B with X rocblas_trsm(handle, rocblas_side_left, rocblas_fill_lower, trans, rocblas_diagonal_unit, n, nrhs, @@ -67,13 +67,13 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope trans, rocblas_diagonal_non_unit, n, nrhs, oneInt, Ap, lda, Bp, ldb); } - + } else { for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve U**T *X = B or U**H *X = B, overwriting B with X 
rocblas_trsm(handle, rocblas_side_left, rocblas_fill_upper, trans, rocblas_diagonal_non_unit, n, nrhs, diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_batched.cpp index dd2dbe6a..43d48ac5 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -44,7 +44,7 @@ rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } @@ -52,26 +52,26 @@ rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, c extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, 
rocblas_double_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_strided_batched.cpp index 49ced525..e42302d3 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_getrs_strided_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -40,7 +40,7 @@ rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } @@ -48,26 +48,26 @@ rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, 
strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.cpp index 1ed3f0ee..0127cbe0 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.cpp @@ -5,14 +5,14 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.hpp index 4e1c3c91..518d202e 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2.hpp @@ -18,9 +18,9 @@ #include "common_device.hpp" #include "ideal_sizes.hpp" -template -__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, - const rocblas_int j, T *res, rocblas_int *info) +template +__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, + const rocblas_int j, T *res, rocblas_int *info) { int id = hipBlockIdx_x; @@ -45,10 +45,10 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; #ifdef batched @@ -70,7 +70,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //diagonal info in device (device memory workspace to avoid synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -95,7 +95,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(0, j, lda)), 1, (M + idx2D(0, j, lda)), 1, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // 
Compute elements J+1:N of row J @@ -103,9 +103,9 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_transpose, j, n - j - 1, - d_minone, (M + idx2D(0, j + 1, lda)), lda, + d_minone, (M + idx2D(0, j + 1, lda)), lda, (M + idx2D(0, j, lda)), 1, d_one, (M + idx2D(j, j + 1, lda)), lda); - } + } for (int b=0;b(AA,shiftA,b,strideA); rocblas_scal(handle, n - j - 1, (pivotGPU + b), @@ -122,7 +122,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(j, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -130,7 +130,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_none, n - j - 1, j, - d_minone, (M + idx2D(j + 1, 0, lda)), lda, + d_minone, (M + idx2D(j + 1, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, d_one, (M + idx2D(j + 1, j, lda)), 1); } for (int b=0;b -rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2_strided_batched.cpp index 4988f364..4e88e448 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potf2_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.cpp index e0512eed..b8be605f 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.cpp @@ -5,14 +5,14 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.hpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.hpp index 1f1c6650..aef657d4 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.hpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf.hpp @@ -19,12 +19,12 @@ #include "ideal_sizes.hpp" #include "roclapack_potf2.hpp" -inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) +inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) { int id = hipBlockIdx_x; if (info[id] == 0 && iinfo[id] > 0) - info[id] = iinfo[id] + j; + info[id] = iinfo[id] + j; } template @@ -32,14 +32,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (n < POTRF_POTF2_SWITCHSIZE) + if (n < POTRF_POTF2_SWITCHSIZE) return rocsolver_potf2_template(handle, uplo, n, A, shiftA, lda, strideA, info, batch_count); #ifdef batched @@ -61,7 +61,7 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //info in device (device memory workspace to avoid synchronization with CPU) - rocblas_int *iinfo; + rocblas_int *iinfo; hipMalloc(&iinfo, sizeof(rocblas_int)*batch_count); hipStream_t stream; @@ -81,14 +81,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, if (uplo == rocblas_fill_upper) { // Compute the Cholesky factorization A = U'*U. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. 
hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_transpose, rocblas_operation_none, @@ -112,14 +112,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, } else { // Compute the Cholesky factorization A = L'*L. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_none, rocblas_operation_transpose, diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_batched.cpp index 7ac5061e..06dda30c 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_batched.cpp @@ -6,15 +6,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_strided_batched.cpp index 2e49ab4b..6c081fc4 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/library/src/lapack/roclapack_potrf_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/library/src/rocsolver-config.cmake.in b/ROCm_Libraries/rocSOLVER/library/src/rocsolver-config.cmake.in index 970adc43..8b6304e0 100644 --- a/ROCm_Libraries/rocSOLVER/library/src/rocsolver-config.cmake.in +++ b/ROCm_Libraries/rocSOLVER/library/src/rocsolver-config.cmake.in @@ -1,6 +1,6 @@ @PACKAGE_INIT@ - + set_and_check(rocsolver_INCLUDE_DIR @PACKAGE_INCLUDE_INSTALL_DIR@) set_and_check(rocsolver_INCLUDE_DIRS @PACKAGE_INCLUDE_INSTALL_DIR@) diff --git a/ROCm_Libraries/rocSOLVER/src/CMakeLists.txt b/ROCm_Libraries/rocSOLVER/src/CMakeLists.txt index cbf3d10d..4a435950 100644 --- a/ROCm_Libraries/rocSOLVER/src/CMakeLists.txt +++ b/ROCm_Libraries/rocSOLVER/src/CMakeLists.txt @@ -82,7 +82,7 @@ add_library( rocsolver ${rocsolver_lapack_source} ${relative_rocsolver_headers_public} ${rocsolver_auxiliary_source} - ${rocsolver_common_source} + ${rocsolver_common_source} ) add_library( roc::rocsolver ALIAS rocsolver ) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.cpp index 9c52fd62..8c4e0c70 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_larf.hpp" template -rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, T* x, const rocsolver_int incx, const T* alpha, T* A, const rocsolver_int lda) { @@ -24,7 +24,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side rocblas_int stridep = 0; rocblas_int batch_count=1; - return rocsolver_larf_template(handle,side, + return rocsolver_larf_template(handle,side, m,n, x,0, //vector shifted 0 entries incx, @@ -33,7 +33,7 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side stridep, A,0, //matrix shifted 0 entries lda, - stridea, + stridea, batch_count); } @@ -46,14 +46,14 @@ rocblas_status rocsolver_larf_impl(rocsolver_handle handle, const rocsolver_side extern "C" { -ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_slarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, float* x, const rocsolver_int incx, const float* alpha, float* A, const rocsolver_int lda) { return rocsolver_larf_impl(handle, side, m, n, x, incx, alpha, A, lda); } -ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, +ROCSOLVER_EXPORT rocblas_status rocsolver_dlarf(rocsolver_handle handle, const rocsolver_side side, const rocsolver_int m, const rocsolver_int n, double* x, const rocsolver_int incx, const double* alpha, double* A, const rocsolver_int lda) { diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.hpp index 27a5a0d4..3755ea14 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larf.hpp @@ -19,8 +19,8 @@ template rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_side side, 
const rocsolver_int m, - const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, - const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, + const rocsolver_int n, U x, const rocblas_int shiftx, const rocsolver_int incx, + const rocblas_int stridex, const T* alpha, const rocblas_int stridep, U A, const rocblas_int shiftA, const rocsolver_int lda, const rocblas_int stridea, const rocblas_int batch_count) { // quick return @@ -40,7 +40,7 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ T* zeroInt; //constant 0 in device hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -66,16 +66,16 @@ rocblas_status rocsolver_larf_template(rocsolver_handle handle, const rocsolver_ // OF A AND X, AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + //memory in GPU (workspace) T *workvec; hipMalloc(&workvec, sizeof(T)*order*batch_count); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute the matrix vector product (W=tau*A'*X or W=tau*A*X) for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.cpp index 12ed4e92..d28b4a03 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.cpp @@ -5,10 +5,10 @@ #include "rocauxiliary_larfb.hpp" template -rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* F, const rocsolver_int ldf, T* A, const rocsolver_int lda) { @@ -22,7 +22,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid if (storev == rocsolver_row_wise) { if (ldv < k) return rocblas_status_invalid_size; - } else { + } else { if ((side == rocblas_side_left && ldv < m) || (side == rocblas_side_right && ldv < n)) return rocblas_status_invalid_size; } @@ -34,7 +34,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid rocblas_int stridef = 0; rocblas_int batch_count=1; - return rocsolver_larfb_template(handle,side,trans,direct,storev, + return rocsolver_larfb_template(handle,side,trans,direct,storev, m,n,k, V,0, //shifted 0 entries ldv, @@ -44,7 +44,7 @@ rocblas_status rocsolver_larfb_impl(rocsolver_handle handle, const rocsolver_sid stridef, A,0, //shifted 0 entries lda, - stridea, + stridea, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.hpp index 5214e29a..dc4ee469 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfb.hpp @@ -19,7 +19,7 @@ template -__global__ void 
copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -38,7 +38,7 @@ __global__ void copymatA1(const rocsolver_int ldw, const rocsolver_int order, U } template -__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) +__global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T* work) { const auto blocksizex = hipBlockDim_x; const auto blocksizey = hipBlockDim_y; @@ -52,18 +52,18 @@ __global__ void addmatA1(const rocsolver_int ldw, const rocsolver_int order, U A Wp = work + b*strideW; Ap = load_ptr_batch(A,shiftA,b,strideA); - Ap[i + j*lda] -= Wp[i + j*ldw]; + Ap[i + j*lda] -= Wp[i + j*ldw]; } } template -rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, - const rocsolver_operation trans, const rocsolver_direct direct, +rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver_side side, + const rocsolver_operation trans, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, T *F, const rocsolver_int shiftF, - const rocsolver_int ldf, const rocsolver_int strideF, + const rocsolver_int ldf, const rocsolver_int strideF, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, const rocsolver_int batch_count) { @@ -100,14 +100,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver //determine the side, size of workspace //and whether V is trapezoidal - rocsolver_operation transp; + rocsolver_operation transp; rocsolver_fill uploV; bool trap; rocblas_int order, ldw; - bool colwise = (storev == rocsolver_column_wise); + bool colwise = (storev == rocsolver_column_wise); bool leftside = (side == rocblas_side_left); size_t offsetV; - + if (leftside) { order = n; ldw = k; @@ -120,16 +120,16 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver if (colwise) { uploV = rocblas_fill_lower; offsetV = idx2D(k,0,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_transpose; - else + else transp = rocblas_operation_none; } else { uploV = rocblas_fill_upper; offsetV = idx2D(0,k,ldv); - if (leftside) + if (leftside) transp = rocblas_operation_none; - else + else transp = rocblas_operation_transpose; } @@ -146,15 +146,15 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver rocblas_int blocksx = (order - 1)/32 + 1; rocblas_int blocksy = (ldw - 1)/32 + 1; hipLaunchKernelGGL(copymatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + // BACKWARD DIRECTION TO BE IMPLEMENTED... 
rocsolver_fill uploT = rocblas_fill_upper; if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - + //compute: // V1' * A1, or - // or + // or // A1 * V1 for (int b=0;b(VV,shiftV,b,strideV); @@ -162,14 +162,14 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } // compute: - // V1' * A1 + V2' * A2 - // or + // V1' * A1 + V2' * A2 + // or // A1 * V1 + A2 * V2 - if (trap) { + if (trap) { for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,ldw,order,m-k,oneInt, (Vp + offsetV),ldv, (Ap + idx2D(k,0,lda)),lda, @@ -183,10 +183,10 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } - // compute: + // compute: // trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) + // (A1 * V1 + A2 * V2) * trans(T) for (int b=0;b(FF,shiftF,b,strideF); rocblas_trmm(handle,side,uploT,trans,rocblas_diagonal_non_unit,ldw,order,oneInt,Fp,ldf,(work + b*strideW),ldw); @@ -195,7 +195,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver // compute: // A2 - V2 * trans(T) * (V1' * A1 + V2' * A2) // or - // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' + // A2 - (A1 * V1 + A2 * V2) * trans(T) * V2' if (transp == rocblas_operation_transpose) transp = rocblas_operation_none; else @@ -205,7 +205,7 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver for (int b=0;b(AA,shiftA,b,strideA); Vp = load_ptr_batch(VV,shiftV,b,strideV); - if (leftside) { + if (leftside) { rocblas_gemm(handle,transp,rocblas_operation_none,m-k,order,ldw,minoneInt, (Vp + offsetV),ldv, (work + b*strideW),ldw, @@ -218,22 +218,22 @@ rocblas_status rocsolver_larfb_template(rocsolver_handle handle, const rocsolver } } } - + // compute: // V1 * trans(T) * (V1' * A1 + V2' * A2) // or - // (A1 * V1 + A2 * V2) * trans(T) * V1' + // (A1 * V1 + A2 * V2) * trans(T) * V1' for (int b=0;b(VV,shiftV,b,strideV); rocblas_trmm(handle,side,uploV,transp,rocblas_diagonal_unit,ldw,order,oneInt,Vp,ldv,(work + b*strideW),ldw); } - + // compute: // A1 - V1 * trans(T) * (V1' * A1 + V2' * A2) // or // A1 - (A1 * V1 + A2 * V2) * trans(T) * V1' hipLaunchKernelGGL(addmatA1,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream,ldw,order,A,shiftA,lda,strideA,work); - + hipFree(minoneInt); hipFree(oneInt); hipFree(work); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.cpp index 4b1e00fa..8e651066 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.cpp @@ -26,7 +26,7 @@ rocblas_status rocsolver_larfg_impl(rocblas_handle handle, const rocblas_int n, incx, stridex, tau, - strideP, + strideP, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.hpp index f4fc193c..38683f5d 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larfg.hpp @@ -42,7 +42,7 @@ __global__ void set_taubeta(T *tau, const rocblas_int strideP, T *norms, U alpha template -rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const rocblas_int shifta, +rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int n, U alpha, const 
rocblas_int shifta, U x, const rocblas_int shiftx, const rocblas_int incx, const rocblas_int stridex, T *tau, const rocblas_int strideP, const rocblas_int batch_count) { @@ -54,11 +54,11 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int hipStream_t stream; rocblas_get_stream(handle, &stream); dim3 gridReset(1, batch_count, 1); - dim3 threads(1, 1, 1); + dim3 threads(1, 1, 1); if (n == 1) { hipLaunchKernelGGL(reset_batch_info,gridReset,threads,0,stream,tau,strideP,1,0); - return rocblas_status_success; - } + return rocblas_status_success; + } T *xp; @@ -73,12 +73,12 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *norms; - hipMalloc(&norms, sizeof(T)*batch_count); + hipMalloc(&norms, sizeof(T)*batch_count); // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + //compute norm of x for (int b=0;b(xx,shiftx,b,stridex); @@ -87,9 +87,9 @@ rocblas_status rocsolver_larfg_template(rocblas_handle handle, const rocblas_int //set value of tau and beta and scalling factor for vector x //alpha <- beta - //norms <- scalling + //norms <- scalling hipLaunchKernelGGL(set_taubeta,dim3(batch_count),dim3(1),0,stream,tau,strideP,norms,alpha,shifta,stridex); - + //compute vector v=x*norms for (int b=0;b(xx,shiftx,b,stridex); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.cpp index 5ab79a92..10915015 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_larft.hpp" template -rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, - const rocsolver_storev storev, const rocsolver_int n, +rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_direct direct, + const rocsolver_storev storev, const rocsolver_int n, const rocsolver_int k, T* V, const rocsolver_int ldv, T* tau, T* F, const rocsolver_int ldf) { @@ -38,7 +38,7 @@ rocblas_status rocsolver_larft_impl(rocsolver_handle handle, const rocsolver_dir stridet, F, ldf, - stridef, + stridef, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.hpp index ee2add09..8a38ac3f 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_larft.hpp @@ -17,8 +17,8 @@ #include "common_device.hpp" template -__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, - T* tau, const rocsolver_int strideT, +__global__ void set_triangular(const rocsolver_int k, U V, const rocsolver_int shiftV, const rocsolver_int ldv, const rocsolver_int strideV, + T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_storev storev) { const auto blocksize = hipBlockDim_x; @@ -51,20 +51,20 @@ __global__ void set_tau(const rocsolver_int k, T* tau, const rocsolver_int strid const auto blocksize = hipBlockDim_x; const auto b = hipBlockIdx_x; const auto i = hipBlockIdx_y * blocksize + hipThreadIdx_x; - + if (i < k) { T *tp; tp = tau + b*strideT; tp[i] = -tp[i]; } } - + template -rocblas_status 
rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, +rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver_direct direct, const rocsolver_storev storev, const rocsolver_int n, - const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, - const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, + const rocsolver_int k, U V, const rocblas_int shiftV, const rocsolver_int ldv, + const rocsolver_int strideV, T* tau, const rocsolver_int strideT, T* F, const rocsolver_int ldf, const rocsolver_int strideF, const rocsolver_int batch_count) { // quick return @@ -84,7 +84,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipMemcpy(oneInt, &one, sizeof(T), hipMemcpyHostToDevice); hipMalloc(&zeroInt, sizeof(T)); hipMemcpy(zeroInt, &zero, sizeof(T), hipMemcpyHostToDevice); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -98,26 +98,26 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver if (direct == rocsolver_backward_direction) return rocblas_status_not_implemented; - //Fix diagonal of T, make zero the non used triangular part, + //Fix diagonal of T, make zero the non used triangular part, //setup tau (changing signs) and account for the non-stored 1's on the householder vectors rocblas_int blocks = (k - 1)/32 + 1; hipLaunchKernelGGL(set_triangular,dim3(blocks,blocks,batch_count),dim3(32,32),0,stream, k,V,shiftV,ldv,strideV,tau,strideT,F,ldf,strideF,storev); hipLaunchKernelGGL(set_tau,dim3(batch_count,blocks),dim3(32,1),0,stream,k,tau,strideT); - // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS + // **** FOR NOW, IT DOES NOT LOOK FOR TRAILING ZEROS // AS THIS WOULD REQUIRE SYNCHRONIZATION WITH GPU. // IT WILL WORK ON THE ENTIRE MATRIX/VECTOR REGARDLESS OF // ZERO ENTRIES **** - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - - rocblas_operation trans; - - for (int i = 1; i < k; ++i) { + rocblas_operation trans; + + + for (int i = 1; i < k; ++i) { //compute the matrix vector product, using the householder vectors for (int b=0;b(VV,shiftV,b,strideV); Fp = F + b*strideF; - rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, + rocblas_gemv(handle, trans, i, i, oneInt, Fp, ldf, (Fp + idx2D(0,i,ldf)), 1, zeroInt, (Fp + idx2D(0,i,ldf)), 1); - } + } } //restore tau @@ -151,7 +151,7 @@ rocblas_status rocsolver_larft_template(rocsolver_handle handle, const rocsolver hipFree(oneInt); hipFree(zeroInt); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.cpp index e79f652f..360fef79 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.cpp @@ -54,14 +54,14 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_dlaswp(rocsolver_handle handle, const } ROCSOLVER_EXPORT rocblas_status rocsolver_claswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_float_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); } ROCSOLVER_EXPORT rocblas_status rocsolver_zlaswp(rocsolver_handle handle, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, + rocblas_double_complex *A, const rocsolver_int lda, const rocsolver_int k1, const rocsolver_int k2, const rocsolver_int *ipiv, const rocblas_int incx) { return rocsolver_laswp_impl(handle, n, A, lda, k1, k2, ipiv, incx); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.hpp index 0dc74205..4615a7ec 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_laswp.hpp @@ -51,10 +51,10 @@ __global__ void laswp_kernel(const rocblas_int n, U AA, const rocblas_int shiftA template rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int k1, const rocblas_int k2, - const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, + const rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int incx, const rocblas_int batch_count) { // quick return - if (n == 0 || !batch_count) + if (n == 0 || !batch_count) return rocblas_status_success; rocblas_int start, end, inc; @@ -63,7 +63,7 @@ rocblas_status rocsolver_laswp_template(rocblas_handle handle, const rocblas_int end = k1 - 1; inc = -1; incx = -incx; - } + } else { start = k1; end = k2 + 1; diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.cpp index 102fd83e..465b3635 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_org2r.hpp" template -rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const 
rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_org2r_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.hpp index 08d072aa..2dbcc11e 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_org2r.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j > i) + else if (j > i) Ap[i + j*lda] = 0.0; else if (j >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_col(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. 
ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_org2r_template(rocsolver_handle handle, const rocsolver if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j,j+1,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th column -corresponding to H(i)- if (j < m - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), - (M + idx2D(j + 1, j, lda)), 1); - } + rocblas_scal(handle, (m-j-1), (ipiv + b*strideP + j), + (M + idx2D(j + 1, j, lda)), 1); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.cpp index bd3e4714..eb4f0bb6 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orgbr.hpp" template -rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgbr_impl(rocsolver_handle handle, const rocsolver_storev storev, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.hpp index a1315b6e..deec30a8 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgbr.hpp @@ -23,7 +23,7 @@ #define BS 32 //blocksize for kernels template -__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -33,17 +33,17 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && j <= i) { rocblas_int offset = j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy columns - Wp[i + j*ldw - offset] = (j == 0 ? 0.0 : Ap[i+1 + (j-1)*lda]); - + Wp[i + j*ldw - offset] = (j == 0 ? 
0.0 : Ap[i+1 + (j-1)*lda]); + } else { - // shift columns to the right + // shift columns to the right Ap[i+1 + j*lda] = Wp[i + j*ldw - offset]; - + // make first row the identity if (i == j) { Ap[(j+1)*lda] = 0.0; @@ -55,7 +55,7 @@ __global__ void copyshift_col(const bool copy, const rocblas_int dim, U A, const } template -__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, +__global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const rocsolver_int shiftA, const rocsolver_int lda, const rocsolver_int strideA, T *W, const rocsolver_int shiftW, const rocsolver_int ldw, const rocsolver_int strideW) { const auto b = hipBlockIdx_z; @@ -65,17 +65,17 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const if (i < dim && j < dim && i <= j) { rocblas_int offset = j*ldw - j*(j+1)/2; //to acommodate in smaller array W - T *Ap = load_ptr_batch(A,shiftA,b,strideA); + T *Ap = load_ptr_batch(A,shiftA,b,strideA); T *Wp = load_ptr_batch(W,shiftW,b,strideW); - + if (copy) { //copy rows - Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); - + Wp[i + j*ldw - offset] = (i == 0 ? 0.0 : Ap[i-1 + (j+1)*lda]); + } else { - // shift rows downward + // shift rows downward Ap[i + (j+1)*lda] = Wp[i + j*ldw - offset]; - + // make first column the identity if (i == j) { Ap[i+1] = 0.0; @@ -87,9 +87,9 @@ __global__ void copyshift_row(const bool copy, const rocblas_int dim, U A, const } template -rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver_storev storev, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -99,11 +99,11 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization + // if column-wise, compute orthonormal columns of matrix Q in the bi-diagonalization // of a m-by-k matrix A (given by gebrd) if (storev == rocsolver_column_wise) { if (m >= k) { - rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); + rocsolver_orgqr_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); } else { // shift the householder vectors provided by gebrd as they come below the first subdiagonal // workspace @@ -115,21 +115,21 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (m - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); - + 
hipLaunchKernelGGL(copyshift_col,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,m-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + // result - rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + rocsolver_orgqr_template(handle, m-1, m-1, m-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); + hipFree(W); - } + } } - - // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization + + // if row-wise, compute orthonormal rowss of matrix P' in the bi-diagonalization // of a k-by-n matrix A (given by gebrd) else { if (n > k) { @@ -145,19 +145,19 @@ rocblas_status rocsolver_orgbr_template(rocsolver_handle handle, const rocsolver rocblas_int blocks = (n - 2)/BS + 1; // copy - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + true,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // shift - hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, - false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); + hipLaunchKernelGGL(copyshift_row,dim3(blocks,blocks,batch_count),dim3(BS,BS),0,stream, + false,n-1,A,shiftA,lda,strideA,W,0,ldw,strideW); // result rocsolver_orglq_template(handle, n-1, n-1, n-1, A, shiftA + idx2D(1,1,lda), lda, strideA, ipiv, strideP, batch_count); - + hipFree(W); } - } + } return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.cpp index 27e3d8ed..ec38dc16 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgl2.hpp" template -rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgl2_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.hpp index 202a4fc3..35475070 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgl2.hpp @@ -29,10 +29,10 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r if (i < m && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - - if (i == j) + + if (i == j) Ap[i + j*lda] = 1.0; - else if (j < i) + else if (j < i) Ap[i + j*lda] = 0.0; else if (i >= k) Ap[i + j*lda] = 0.0; @@ -40,9 +40,9 @@ __global__ void init_ident_row(const rocblas_int m, const rocblas_int n, const r } template -rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -51,7 +51,7 @@ rocblas_status 
rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -64,7 +64,7 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** - + T* M; // Initialize identity matrix (non used columns) @@ -78,34 +78,34 @@ rocblas_status rocsolver_orgl2_template(rocsolver_handle handle, const rocsolver if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) A, shiftA + idx2D(j+1,j,lda), //matrix to work on lda, strideA, //leading dimension - batch_count); + batch_count); } // set the diagonal element and negative tau hipLaunchKernelGGL(setdiag,dim3(batch_count),dim3(1),0,stream, j,A,shiftA,lda,strideA,ipiv,strideP); - + // update i-th row -corresponding to H(i)- if (j < n - 1) { for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), - (M + idx2D(j, j + 1, lda)), lda); - } + rocblas_scal(handle, (n-j-1), (ipiv + b*strideP + j), + (M + idx2D(j, j + 1, lda)), lda); + } } } - + // restore values of tau blocksx = (k - 1)/128 + 1; hipLaunchKernelGGL(restau,dim3(blocksx,batch_count),dim3(128),0,stream, k,ipiv,strideP); - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.cpp index 35b17482..e3039734 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orglq.hpp" template -rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orglq_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.hpp index 97886fce..39f77a46 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orglq.hpp @@ -32,16 +32,16 @@ __global__ void set_zero_row(const rocblas_int m, const rocblas_int kk, U A, if (i < m && j < kk) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -50,9 +50,9 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, 
const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_orgl2_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -64,34 +64,34 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding left submatrix if (kk < m) { blocksx = (m - kk - 1)/32 + 1; blocksy = (kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, m,kk,A,shiftA,lda,strideA); - - rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_orgl2_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < m) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -110,13 +110,13 @@ rocblas_status rocsolver_orglq_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_row,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_orgl2_template(handle, jb, n - j, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_orgl2_template(handle, jb, n - j, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.cpp index ef11bd5e..7b1aceec 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.cpp @@ -5,7 +5,7 @@ #include "rocauxiliary_orgqr.hpp" template -rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orgqr_impl(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv) { if(!handle) diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.hpp index 86386317..8079413c 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orgqr.hpp @@ -32,15 +32,15 @@ __global__ void set_zero_col(const rocblas_int n, const rocblas_int kk, U A, if (i < kk && j < n) { T *Ap = load_ptr_batch(A,shiftA,b,strideA); - + Ap[i + j*lda] = 0.0; } } template -rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int 
m, - const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, - const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver_int m, + const rocsolver_int n, const rocsolver_int k, U A, const rocblas_int shiftA, + const rocsolver_int lda, const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, const rocsolver_int batch_count) { // quick return @@ -49,9 +49,9 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipStream_t stream; rocblas_get_stream(handle, &stream); - + // if the matrix is small, use the unblocked variant of the algorithm - if (k <= GEQRF_GEQR2_SWITCHSIZE) + if (k <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_org2r_template(handle, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, batch_count); //memory in GPU (workspace) @@ -63,34 +63,34 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver // start of first blocked block rocblas_int jb = GEQRF_GEQR2_BLOCKSIZE; rocblas_int j = ((k - GEQRF_GEQR2_SWITCHSIZE - 1) / jb) * jb; - + // start of the unblocked block - rocblas_int kk = min(k, j + jb); + rocblas_int kk = min(k, j + jb); rocblas_int blocksy, blocksx; - - // compute the unblockled part and set to zero the + + // compute the unblockled part and set to zero the // corresponding top submatrix if (kk < n) { blocksx = (kk - 1)/32 + 1; blocksy = (n- kk - 1)/32 + 1; hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, n,kk,A,shiftA,lda,strideA); - - rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, - A, shiftA + idx2D(kk, kk, lda), lda, + + rocsolver_org2r_template(handle, m - kk, n - kk, k - kk, + A, shiftA + idx2D(kk, kk, lda), lda, strideA, (ipiv + kk), strideP, batch_count); } // compute the blocked part while (j >= 0) { - + // first update the already computed part // applying the current block reflector using larft + larfb if (j + jb < n) { - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -109,13 +109,13 @@ rocblas_status rocsolver_orgqr_template(rocsolver_handle handle, const rocsolver hipLaunchKernelGGL(set_zero_col,dim3(blocksx,blocksy,batch_count),dim3(32,32),0,stream, j+jb,j,A,shiftA,lda,strideA); } - rocsolver_org2r_template(handle, m - j, jb, jb, - A, shiftA + idx2D(j, j, lda), lda, + rocsolver_org2r_template(handle, m - j, jb, jb, + A, shiftA + idx2D(j, j, lda), lda, strideA, (ipiv + j), strideP, batch_count); j -= jb; } - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.cpp index 34ee185b..fdaa1724 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_orm2r.hpp" template -rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, 
const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_orm2r_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.hpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.hpp index 10522f08..dd83c375 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_orm2r.hpp @@ -18,10 +18,10 @@ #include "../auxiliary/rocauxiliary_larf.hpp" template -rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -72,14 +72,14 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver ncol = n - i; jc = i; } - - // insert one in A(i,i) tobuild/apply the householder matrix + + // insert one in A(i,i) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); - // Apply current Householder reflector + // Apply current Householder reflector rocsolver_larf_template(handle,side, //side nrow, //number of rows of matrix to modify - ncol, //number of columns of matrix to modify + ncol, //number of columns of matrix to modify A, shiftA + idx2D(i,i,lda), //householder vector x 1, strideA, //inc of x (ipiv + i), strideP, //householder scalar (alpha) @@ -90,7 +90,7 @@ rocblas_status rocsolver_orm2r_template(rocsolver_handle handle, const rocsolver // restore original value of A(i,i) hipLaunchKernelGGL(restore_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(i,i,lda),strideA); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.cpp b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.cpp index 7d11d5e6..820f4a46 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.cpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.cpp @@ -5,8 +5,8 @@ #include "rocauxiliary_ormqr.hpp" template -rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, +rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, const rocsolver_int k, T* A, const rocsolver_int lda, T* ipiv, T *C, const rocsolver_int ldc) { if(!handle) @@ -35,7 +35,7 @@ rocblas_status rocsolver_ormqr_impl(rocsolver_handle handle, const rocsolver_sid strideA, ipiv, strideP, - C,0, + C,0, ldc, strideC, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.hpp 
b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.hpp index fd0b523c..b24d77cd 100644 --- a/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.hpp +++ b/ROCm_Libraries/rocSOLVER/src/auxiliary/rocauxiliary_ormqr.hpp @@ -20,10 +20,10 @@ #include "../auxiliary/rocauxiliary_larft.hpp" template -rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, - const rocsolver_int m, const rocsolver_int n, - const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, - const rocsolver_int strideA, T* ipiv, +rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver_side side, const rocsolver_operation trans, + const rocsolver_int m, const rocsolver_int n, + const rocsolver_int k, U A, const rocsolver_int shiftA, const rocsolver_int lda, + const rocsolver_int strideA, T* ipiv, const rocsolver_int strideP, U C, const rocsolver_int shiftC, const rocsolver_int ldc, const rocsolver_int strideC, const rocsolver_int batch_count) { @@ -35,14 +35,14 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked variant of the algorithm - if (k <= ORMQR_ORM2R_BLOCKSIZE) + if (k <= ORMQR_ORM2R_BLOCKSIZE) return rocsolver_orm2r_template(handle, side, trans, m, n, k, A, shiftA, lda, strideA, ipiv, strideP, C, shiftC, ldc, strideC, batch_count); //memory in GPU (workspace) T* work; rocblas_int ldw = ORMQR_ORM2R_BLOCKSIZE; rocblas_int strideW = ldw *ldw; - hipMalloc(&work, sizeof(T)*strideW*batch_count); + hipMalloc(&work, sizeof(T)*strideW*batch_count); // determine limits and indices bool left = (side == rocblas_side_left); @@ -100,7 +100,7 @@ rocblas_status rocsolver_ormqr_template(rocsolver_handle handle, const rocsolver C, shiftC + idx2D(ic,jc,ldc),ldc,strideC, batch_count); } - + return rocblas_status_success; } diff --git a/ROCm_Libraries/rocSOLVER/src/common/rocblas.cpp b/ROCm_Libraries/rocSOLVER/src/common/rocblas.cpp index 2d57c7d9..65dd0697 100644 --- a/ROCm_Libraries/rocSOLVER/src/common/rocblas.cpp +++ b/ROCm_Libraries/rocSOLVER/src/common/rocblas.cpp @@ -104,7 +104,7 @@ rocblas_status rocblas_iamax(rocblas_handle handle, rocblas_int n, return rocblas_izamax(handle, n, x, incx, result); } -//ger +//ger template <> rocblas_status rocblas_ger(rocblas_handle handle, rocblas_int m, rocblas_int n, diff --git a/ROCm_Libraries/rocSOLVER/src/include/common_device.hpp b/ROCm_Libraries/rocSOLVER/src/include/common_device.hpp index 1aaaab61..d28acb79 100644 --- a/ROCm_Libraries/rocSOLVER/src/include/common_device.hpp +++ b/ROCm_Libraries/rocSOLVER/src/include/common_device.hpp @@ -36,16 +36,16 @@ __forceinline__ __device__ __host__ T* load_ptr_batch(T *const p[], rocblas_int } template -__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) +__forceinline__ __global__ void get_array(T** out, T* in, rocblas_int stride, rocblas_int batch) { int b = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; - + if (b < batch) out[b] = in + b*stride; } template -__forceinline__ __global__ void setdiag(const rocblas_int j, U A, +__forceinline__ __global__ void setdiag(const rocblas_int j, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, T *ipiv, const rocblas_int strideP) { @@ -54,7 +54,7 @@ __forceinline__ __global__ void setdiag(const rocblas_int j, U A, T *tau = ipiv + b*strideP; T t = -tau[j]; - tau[j] = t; + tau[j] = t; Ap[j + 
j*lda] = 1.0 + t; } diff --git a/ROCm_Libraries/rocSOLVER/src/include/ideal_sizes.hpp b/ROCm_Libraries/rocSOLVER/src/include/ideal_sizes.hpp index 5d9cf574..260d9d1f 100644 --- a/ROCm_Libraries/rocSOLVER/src/include/ideal_sizes.hpp +++ b/ROCm_Libraries/rocSOLVER/src/include/ideal_sizes.hpp @@ -8,7 +8,7 @@ // IDEAL SIZES ARE DEFINED FOR NOW AS IN CPU-LAPACK // BENCHMARKING OF ROCSOLVER WILL BE NEEDED TO DETERMINE -// MORE SUITABLE VALUES +// MORE SUITABLE VALUES diff --git a/ROCm_Libraries/rocSOLVER/src/include/rocsolver_unique_ptr.hpp b/ROCm_Libraries/rocSOLVER/src/include/rocsolver_unique_ptr.hpp index 185d1690..b7e34f6b 100644 --- a/ROCm_Libraries/rocSOLVER/src/include/rocsolver_unique_ptr.hpp +++ b/ROCm_Libraries/rocSOLVER/src/include/rocsolver_unique_ptr.hpp @@ -1,24 +1,24 @@ -/* ************************************************************************ - * Copyright 2019-2020 Advanced Micro Devices, Inc. - * ************************************************************************ */ - -#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP -#define GUARD_ROCBLAS_MANAGE_PTR_HPP - -#include - -namespace rocsolver { -// device_malloc wraps hipMalloc and provides same API as malloc -static void *device_malloc(size_t byte_size) { - void *pointer; - PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); - return pointer; -} - -// device_free wraps hipFree and provides same API as free -static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } -} // namespace rocsolver - -using rocsolver_unique_ptr = std::unique_ptr; - -#endif +/* ************************************************************************ + * Copyright 2019-2020 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#ifndef GUARD_ROCBLAS_MANAGE_PTR_HPP +#define GUARD_ROCBLAS_MANAGE_PTR_HPP + +#include + +namespace rocsolver { +// device_malloc wraps hipMalloc and provides same API as malloc +static void *device_malloc(size_t byte_size) { + void *pointer; + PRINT_IF_HIP_ERROR(hipMalloc(&pointer, byte_size)); + return pointer; +} + +// device_free wraps hipFree and provides same API as free +static void device_free(void *ptr) { PRINT_IF_HIP_ERROR(hipFree(ptr)); } +} // namespace rocsolver + +using rocsolver_unique_ptr = std::unique_ptr; + +#endif diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.cpp index d412d69a..f5f6d466 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelq2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelq2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.hpp index 29c4266f..81ec19ae 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on row j @@ -45,18 +45,18 @@ rocblas_status rocsolver_gelq2_template(rocblas_handle handle, const rocblas_int n - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(j,min(j+1,n-1),lda), //vector x to work on - lda, strideA, //inc of x + lda, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the right + // Apply Householder reflector to the rest of matrix from the right if (j < m - 1) { rocsolver_larf_template(handle,rocblas_side_right, //side m - j - 1, //number of rows of matrix to modify - n - j, //number of columns of matrix to modify + n - j, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x lda, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_batched.cpp index 027572df..35fe7af5 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int 
batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelq2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_strided_batched.cpp index 9eefcb03..569facbb 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelq2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelq2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelq2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelq2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.cpp index a29c5b0f..f75a0da7 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_gelqf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_gelqf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.hpp index b0e15bef..d40b9dd5 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_gelq2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of rows in the block rocsolver_gelq2_template(handle, jb, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < m) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_row_wise, n-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_row_wise, n-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -76,9 +76,9 @@ rocblas_status rocsolver_gelqf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_gelq2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_batched.cpp index 91631008..cee74932 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_batched.cpp @@ -8,13 
+8,13 @@ template rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_gelqf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_strided_batched.cpp index 13e0312f..a5581819 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_gelqf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_gelqf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgelqf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_gelqf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.cpp index 0cae47b0..249784a0 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqr2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqr2_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.hpp index 668fc8a0..485550d7 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2.hpp @@ -22,12 +22,12 @@ template rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; @@ -36,8 +36,8 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int //memory in GPU (workspace) T *diag; hipMalloc(&diag,sizeof(T)*batch_count); - - rocblas_int dim = min(m, n); //total number of pivots + + rocblas_int dim = min(m, n); //total number of pivots for (rocblas_int j = 0; j < dim; ++j) { // generate Householder reflector to work on column j @@ -45,18 +45,18 @@ rocblas_status rocsolver_geqr2_template(rocblas_handle handle, const rocblas_int m - j, //order of reflector A, shiftA + idx2D(j,j,lda), //value of alpha A, shiftA + idx2D(min(j+1,m-1),j,lda), //vector x to work on - 1, strideA, //inc of x + 1, strideA, //inc of x (ipiv + j), strideP, //tau batch_count); - // insert one in A(j,j) tobuild/apply the householder matrix + // insert one in A(j,j) tobuild/apply the householder matrix hipLaunchKernelGGL(set_one_diag,dim3(batch_count,1,1),dim3(1,1,1),0,stream,diag,A,shiftA+idx2D(j,j,lda),strideA); - // Apply Householder reflector to the rest of matrix from the left + // Apply Householder reflector to the rest of matrix from the left if (j < n - 1) { rocsolver_larf_template(handle,rocblas_side_left, //side m - j, //number of rows of matrix to modify - n - j - 1, //number of columns of matrix to modify + n - j - 1, //number of columns of matrix to modify A, shiftA + idx2D(j,j,lda), //householder vector x 1, strideA, //inc of x (ipiv + j), strideP, //householder scalar (alpha) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_batched.cpp index ef67a2eb..70e765e8 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_batched.cpp @@ -8,13 +8,13 @@ template rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) 
-{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqr2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_strided_batched.cpp index 26816634..e468de7e 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqr2_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqr2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqr2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqr2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.cpp index d941c762..b91aa412 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv) -{ + T* ipiv) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m) @@ -41,13 +41,13 @@ rocblas_status rocsolver_geqrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, float *ipiv) + const rocblas_int lda, float *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, double *ipiv) + const rocblas_int lda, double *ipiv) { return rocsolver_geqrf_impl(handle, m, n, A, lda, ipiv); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.hpp index fcdb4935..e1a3adaf 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf.hpp @@ -24,21 +24,21 @@ template rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, T* ipiv, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, T* ipiv, const rocblas_int strideP, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; hipStream_t stream; rocblas_get_stream(handle, &stream); // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) + if (m <= GEQRF_GEQR2_SWITCHSIZE || n <= GEQRF_GEQR2_SWITCHSIZE) return rocsolver_geqr2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, strideP, batch_count); - + rocblas_int dim = min(m, n); //total number of pivots rocblas_int jb, j = 0; @@ -49,17 +49,17 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int hipMalloc(&work, sizeof(T)*strideW*batch_count); while (j < dim - GEQRF_GEQR2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GEQRF_GEQR2_BLOCKSIZE); //number of columns in the block rocsolver_geqr2_template(handle, m-j, jb, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); //apply transformation to the rest of the matrix if (j + jb < n) { - + //compute block reflector - rocsolver_larft_template(handle, rocsolver_forward_direction, - rocsolver_column_wise, m-j, jb, - A, shiftA + idx2D(j,j,lda), lda, strideA, + rocsolver_larft_template(handle, rocsolver_forward_direction, + rocsolver_column_wise, m-j, jb, + A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, work, ldw, strideW, batch_count); @@ -75,9 +75,9 @@ rocblas_status rocsolver_geqrf_template(rocblas_handle handle, const rocblas_int } //factor last block - if (j < dim) + if (j < dim) rocsolver_geqr2_template(handle, m-j, n-j, A, shiftA + idx2D(j,j,lda), lda, strideA, (ipiv + j), strideP, batch_count); - + hipFree(work); return rocblas_status_success; diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_batched.cpp index 3ae16e6a..41bb01e6 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_batched.cpp @@ 
-8,13 +8,13 @@ template rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,13 +40,13 @@ rocblas_status rocsolver_geqrf_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_batched_impl(handle, m, n, A, lda, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_strided_batched.cpp index b3e3809d..bd670e1f 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_geqrf_strided_batched.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) -{ + T* ipiv, const rocblas_int stridep, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -38,13 +38,13 @@ rocblas_status rocsolver_geqrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, float *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgeqrf_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, double *ipiv, const rocblas_int stridep, const rocblas_int batch_count) { return rocsolver_geqrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, stridep, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.cpp index 9b01a5af..d74da116 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.cpp @@ -7,13 +7,13 @@ template rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int *ipiv, rocblas_int* info) -{ + rocblas_int *ipiv, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || lda < 1) @@ -41,25 +41,25 @@ rocblas_status rocsolver_getf2_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) + const rocblas_int lda, rocblas_int *ipiv, rocblas_int* info ) { return rocsolver_getf2_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.hpp index 727a76c3..5630004e 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2.hpp @@ -44,14 +44,14 @@ inline __global__ void getf2_check_singularity(U AA, const rocblas_int shiftA, c template rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int m, - const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, - rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, + const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, + rocblas_int const strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. **** @@ -69,7 +69,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipMemcpy(minoneInt, &minone, sizeof(T), hipMemcpyHostToDevice); //pivoting info in device (to avoid continuous synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -84,7 +84,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int //info=0 (starting with a nonsingular matrix) hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,info,batch_count,0); - + // **** BATCH IS EXECUTED IN A FOR-LOOP UNTIL BATCH-BLAS // FUNCITONALITY IS ENABLED. ALSO ROCBLAS CALLS SHOULD // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** @@ -93,7 +93,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int // find pivot. Use Fortran 1-based indexing for the ipiv array as iamax does that as well! 
for (int b=0;b(AA,shiftA,b,strideA); - rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, + rocblas_iamax(handle, m - j, (M + idx2D(j, j, lda)), 1, (ipiv + shiftP + b*strideP + j)); } @@ -101,14 +101,14 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int hipLaunchKernelGGL(getf2_check_singularity, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, ipiv, shiftP, strideP, j, lda, pivotGPU, info); - // Swap pivot row and j-th row + // Swap pivot row and j-th row rocsolver_laswp_template(handle, n, A, shiftA, lda, strideA, j+1, j+1, ipiv, shiftP, strideP, 1, batch_count); // Compute elements J+1:M of J'th column for (int b=0;b(AA,shiftA,b,strideA); - rocblas_scal(handle, (m-j-1), (pivotGPU + b), - (M + idx2D(j + 1, j, lda)), oneInt); + rocblas_scal(handle, (m-j-1), (pivotGPU + b), + (M + idx2D(j + 1, j, lda)), oneInt); } // update trailing submatrix @@ -116,7 +116,7 @@ rocblas_status rocsolver_getf2_template(rocblas_handle handle, const rocblas_int for (int b=0;b(AA,shiftA,b,strideA); rocblas_ger(handle, m - j - 1, n - j - 1, minoneInt, - (M + idx2D(j + 1, j, lda)), oneInt, + (M + idx2D(j + 1, j, lda)), oneInt, (M + idx2D(j, j + 1, lda)), lda, (M + idx2D(j + 1, j + 1, lda)), lda); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_batched.cpp index bd9e7240..462e932d 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) @@ -40,25 +40,25 @@ rocblas_status rocsolver_getf2_batched_impl(rocblas_handle handle, const rocblas extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *const A[], - const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_strided_batched.cpp index ccb2d252..b3ea05e9 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getf2_strided_batched.cpp @@ -7,19 +7,19 @@ template rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const rocblas_int m, const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) -{ + rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; if (m < 0 || n < 0 || lda < m || batch_count < 0) return rocblas_status_invalid_size; - + return rocsolver_getf2_template(handle,m,n, A,0, //the matrix is shifted 0 entries (will work on the entire matrix) @@ -39,25 +39,25 @@ rocblas_status rocsolver_getf2_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, float *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, double *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_float_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetf2_strided_batched(rocblas_handle handle, const rocblas_int m, const rocblas_int n, rocblas_double_complex *A, - const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + const rocblas_int lda, const rocblas_int strideA, rocblas_int* ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getf2_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.cpp index 4a1c1b91..9b3bdf70 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, rocblas_int *ipiv, rocblas_int* info) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -40,25 +40,25 @@ rocblas_status rocsolver_getrf_impl(rocblas_handle handle, const rocblas_int m, extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + float *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + double *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_float_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) + rocblas_double_complex *A, const rocsolver_int lda, rocsolver_int *ipiv, rocblas_int* info) { return rocsolver_getrf_impl(handle, m, n, A, lda, ipiv, info); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.hpp index f19138bb..395fd187 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf.hpp @@ -41,13 +41,13 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, rocblas_int *ipiv, const rocblas_int shiftP, const rocblas_int strideP, rocblas_int *info, const rocblas_int batch_count) { // quick return - if (m == 0 || n == 0 || batch_count == 0) + if (m == 0 || n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) + if (m < GETRF_GETF2_SWITCHSIZE || n < GETRF_GETF2_SWITCHSIZE) return rocsolver_getf2_template(handle, m, n, A, shiftA, lda, strideA, ipiv, shiftP, strideP, info, batch_count); - + #ifdef batched // **** THIS SYNCHRONIZATION WILL BE REQUIRED UNTIL // BATCH-BLAS FUNCTIONALITY IS ENABLED. 
**** @@ -92,14 +92,14 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int // BE MADE TO THE CORRESPONDING TEMPLATE_FUNCTIONS **** for (int j = 0; j < dim; j += GETRF_GETF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(dim - j, GETRF_GETF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_getf2_template(handle, m - j, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, ipiv, shiftP + j, strideP, iinfo, batch_count); - + // adjust pivot indices and check singularity sizePivot = min(m - j, jb); //number of pivots in the block - blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; + blocksPivot = (sizePivot - 1) / GETF2_BLOCKSIZE + 1; gridPivot = dim3(blocksPivot, batch_count, 1); hipLaunchKernelGGL(getrf_check_singularity, gridPivot, threads, 0, stream, sizePivot, j, ipiv, shiftP + j, strideP, iinfo, info); @@ -131,7 +131,7 @@ rocblas_status rocsolver_getrf_template(rocblas_handle handle, const rocblas_int (M + idx2D(j + jb, j + jb, lda)), lda); } } - } + } } hipFree(pivotGPU); diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_batched.cpp index 5ed946d0..44317213 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m, - rocblas_int n, U A, rocblas_int lda, + rocblas_int n, U A, rocblas_int lda, rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -39,25 +39,25 @@ rocblas_status rocsolver_getrf_batched_impl(rocblas_handle handle, rocblas_int m extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + double *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) + rocblas_double_complex *const A[], const rocsolver_int lda, rocsolver_int *ipiv, const rocblas_int strideP, rocsolver_int* info, const rocsolver_int batch_count) { return rocsolver_getrf_batched_impl(handle, m, n, A, lda, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_strided_batched.cpp index c1ef590b..35443146 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrf_strided_batched.cpp @@ -10,10 +10,10 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const rocblas_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - if (m < 0 || n < 0 || batch_count < 0 || lda < m) + //logging is missing ??? 
+ + if (m < 0 || n < 0 || batch_count < 0 || lda < m) return rocblas_status_invalid_size; if (!A || !ipiv || !info) return rocblas_status_invalid_pointer; @@ -36,25 +36,25 @@ rocblas_status rocsolver_getrf_strided_batched_impl(rocblas_handle handle, const extern "C" { ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + float *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + double *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_cgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_float_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } ROCSOLVER_EXPORT rocblas_status rocsolver_zgetrf_strided_batched(rocsolver_handle handle, const rocsolver_int m, const rocsolver_int n, - rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) + rocblas_double_complex *A, const rocsolver_int lda, const rocblas_int strideA, rocsolver_int *ipiv, const rocblas_int strideP, rocblas_int* info, const rocblas_int batch_count) { return rocsolver_getrf_strided_batched_impl(handle, m, n, A, lda, strideA, ipiv, strideP, info, batch_count); } diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.cpp index 255e306c..435339c1 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, T *A, const rocblas_int lda, - const rocblas_int *ipiv, T *B, const rocblas_int ldb) + const rocblas_int *ipiv, T *B, const rocblas_int ldb) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? 
- if (n < 0 || nrhs < 0 || lda < n || ldb < n) + if (n < 0 || nrhs < 0 || lda < n || ldb < n) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -45,7 +45,7 @@ rocblas_status rocsolver_getrs_impl(rocblas_handle handle, const rocblas_operati extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, - const rocblas_int *ipiv, float *B, const rocblas_int ldb) + const rocblas_int *ipiv, float *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } @@ -53,21 +53,21 @@ rocsolver_sgetrs(rocblas_handle handle, const rocblas_operation trans, const roc extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, - const rocblas_int *ipiv, double *B, const rocblas_int ldb) + const rocblas_int *ipiv, double *B, const rocblas_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_float_complex *A, const rocsolver_int lda, - const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) + const rocsolver_int *ipiv, rocblas_float_complex *B, const rocsolver_int ldb) { return rocsolver_getrs_impl(handle, trans, n, nrhs, A, lda, ipiv, B, ldb); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs( rocsolver_handle handle, const rocsolver_operation trans, const rocsolver_int n, const rocsolver_int nrhs, rocblas_double_complex *A, const rocsolver_int lda, diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.hpp index 1209770f..e18816df 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs.hpp @@ -19,7 +19,7 @@ template rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, const rocblas_int *ipiv, const rocblas_int strideP, U B, - const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int shiftB, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { // quick return if (n == 0 || nrhs == 0 || batch_count == 0) { @@ -56,7 +56,7 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve L*X = B, overwriting B with X rocblas_trsm(handle, rocblas_side_left, rocblas_fill_lower, trans, rocblas_diagonal_unit, n, nrhs, @@ -67,13 +67,13 @@ rocblas_status rocsolver_getrs_template(rocblas_handle handle, const rocblas_ope trans, rocblas_diagonal_non_unit, n, nrhs, oneInt, Ap, lda, Bp, ldb); } - + } else { for (int b = 0; b < batch_count; ++b) { Ap = load_ptr_batch(AA,shiftA,b,strideA); Bp = load_ptr_batch(BB,shiftB,b,strideB); - + // solve U**T *X = B or U**H *X = B, overwriting B with X rocblas_trsm(handle, 
rocblas_side_left, rocblas_fill_upper, trans, rocblas_diagonal_non_unit, n, nrhs, diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_batched.cpp index dd2dbe6a..43d48ac5 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_batched.cpp @@ -8,14 +8,14 @@ template rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -44,7 +44,7 @@ rocblas_status rocsolver_getrs_batched_impl(rocblas_handle handle, const rocblas extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } @@ -52,26 +52,26 @@ rocsolver_sgetrs_batched(rocblas_handle handle, const rocblas_operation trans, c extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *const B[], const rocblas_int ldb, const rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *const A[], const rocblas_int lda, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *const B[], const rocblas_int ldb, const 
rocblas_int batch_count) { return rocsolver_getrs_batched_impl(handle, trans, n, nrhs, A, lda, ipiv, strideP, B, ldb, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_strided_batched.cpp index 49ced525..e42302d3 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_getrs_strided_batched.cpp @@ -7,14 +7,14 @@ template rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, U A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, U B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { if(!handle) return rocblas_status_invalid_handle; - //logging is missing ??? + //logging is missing ??? - if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) + if (n < 0 || nrhs < 0 || lda < n || ldb < n || batch_count < 0) return rocblas_status_invalid_size; if (!A || !ipiv || !B) @@ -40,7 +40,7 @@ rocblas_status rocsolver_getrs_strided_batched_impl(rocblas_handle handle, const extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, float *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, float *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } @@ -48,26 +48,26 @@ rocsolver_sgetrs_strided_batched(rocblas_handle handle, const rocblas_operation extern "C" ROCSOLVER_EXPORT rocblas_status rocsolver_dgetrs_strided_batched(rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, double *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) + const rocblas_int *ipiv, const rocblas_int strideP, double *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_cgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_float_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_float_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); } -extern "C" ROCSOLVER_EXPORT 
rocsolver_status +extern "C" ROCSOLVER_EXPORT rocsolver_status rocsolver_zgetrs_strided_batched( rocblas_handle handle, const rocblas_operation trans, const rocblas_int n, const rocblas_int nrhs, rocblas_double_complex *A, const rocblas_int lda, const rocblas_int strideA, - const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, + const rocblas_int *ipiv, const rocblas_int strideP, rocblas_double_complex *B, const rocblas_int ldb, const rocblas_int strideB, const rocblas_int batch_count) { return rocsolver_getrs_strided_batched_impl(handle, trans, n, nrhs, A, lda, strideA, ipiv, strideP, B, ldb, strideB, batch_count); diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.cpp index 1ed3f0ee..0127cbe0 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.cpp @@ -5,14 +5,14 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potf2_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.hpp index 4e1c3c91..518d202e 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2.hpp @@ -18,9 +18,9 @@ #include "common_device.hpp" #include "ideal_sizes.hpp" -template -__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, - const rocblas_int j, T *res, rocblas_int *info) +template +__global__ void sqrtDiagOnward(U A, const rocblas_int shiftA, const rocblas_int strideA, const size_t loc, + const rocblas_int j, T *res, rocblas_int *info) { int id = hipBlockIdx_x; @@ -45,10 +45,10 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; #ifdef batched @@ -70,7 +70,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //diagonal info in device (device memory workspace to avoid synchronization with CPU) - T *pivotGPU; + T *pivotGPU; hipMalloc(&pivotGPU, sizeof(T)*batch_count); hipStream_t stream; @@ -95,7 +95,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(0, j, lda)), 1, (M + idx2D(0, j, lda)), 1, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -103,9 +103,9 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); 
rocblas_gemv(handle, rocblas_operation_transpose, j, n - j - 1, - d_minone, (M + idx2D(0, j + 1, lda)), lda, + d_minone, (M + idx2D(0, j + 1, lda)), lda, (M + idx2D(0, j, lda)), 1, d_one, (M + idx2D(j, j + 1, lda)), lda); - } + } for (int b=0;b(AA,shiftA,b,strideA); rocblas_scal(handle, n - j - 1, (pivotGPU + b), @@ -122,7 +122,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, rocblas_dot(handle, j, (M + idx2D(j, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, (pivotGPU + b)); } - hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, + hipLaunchKernelGGL(sqrtDiagOnward, dim3(batch_count), dim3(1), 0, stream, A, shiftA, strideA, idx2D(j, j, lda), j, pivotGPU, info); // Compute elements J+1:N of row J @@ -130,7 +130,7 @@ rocblas_status rocsolver_potf2_template(rocblas_handle handle, for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemv(handle, rocblas_operation_none, n - j - 1, j, - d_minone, (M + idx2D(j + 1, 0, lda)), lda, + d_minone, (M + idx2D(j + 1, 0, lda)), lda, (M + idx2D(j, 0, lda)), lda, d_one, (M + idx2D(j + 1, j, lda)), 1); } for (int b=0;b -rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2_strided_batched.cpp index 4988f364..4e88e448 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potf2_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potf2.hpp" template -rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potf2_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.cpp index e0512eed..b8be605f 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.cpp @@ -5,14 +5,14 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) -{ +rocblas_status rocsolver_potrf_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, rocblas_int* info) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? 
+ if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.hpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.hpp index 1f1c6650..aef657d4 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.hpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf.hpp @@ -19,12 +19,12 @@ #include "ideal_sizes.hpp" #include "roclapack_potf2.hpp" -inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) +inline __global__ void chk_positive(rocblas_int *iinfo, rocblas_int *info, int j) { int id = hipBlockIdx_x; if (info[id] == 0 && iinfo[id] > 0) - info[id] = iinfo[id] + j; + info[id] = iinfo[id] + j; } template @@ -32,14 +32,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, U A, const rocblas_int shiftA, const rocblas_int lda, const rocblas_int strideA, - rocblas_int *info, const rocblas_int batch_count) + rocblas_int *info, const rocblas_int batch_count) { // quick return - if (n == 0 || batch_count == 0) + if (n == 0 || batch_count == 0) return rocblas_status_success; // if the matrix is small, use the unblocked (BLAS-levelII) variant of the algorithm - if (n < POTRF_POTF2_SWITCHSIZE) + if (n < POTRF_POTF2_SWITCHSIZE) return rocsolver_potf2_template(handle, uplo, n, A, shiftA, lda, strideA, info, batch_count); #ifdef batched @@ -61,7 +61,7 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, hipMemcpy(d_minone, &h_minone, sizeof(T), hipMemcpyHostToDevice); //info in device (device memory workspace to avoid synchronization with CPU) - rocblas_int *iinfo; + rocblas_int *iinfo; hipMalloc(&iinfo, sizeof(rocblas_int)*batch_count); hipStream_t stream; @@ -81,14 +81,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, if (uplo == rocblas_fill_upper) { // Compute the Cholesky factorization A = U'*U. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_transpose, rocblas_operation_none, @@ -112,14 +112,14 @@ rocblas_status rocsolver_potrf_template(rocblas_handle handle, } else { // Compute the Cholesky factorization A = L'*L. for (rocblas_int j = 0; j < n; j += POTRF_POTF2_SWITCHSIZE) { - // Factor diagonal and subdiagonal blocks + // Factor diagonal and subdiagonal blocks jb = min(n - j, POTRF_POTF2_SWITCHSIZE); //number of columns in the block hipLaunchKernelGGL(reset_info,gridReset,threads,0,stream,iinfo,batch_count,0); rocsolver_potf2_template(handle, uplo, jb, A, shiftA + idx2D(j, j, lda), lda, strideA, iinfo, batch_count); - + // test for non-positive-definiteness. 
hipLaunchKernelGGL(chk_positive,gridReset,threads,0,stream,iinfo,info,j); - + if (j + jb < n) { // update trailing submatrix for (int b=0;b(AA,shiftA,b,strideA); rocblas_gemm(handle, rocblas_operation_none, rocblas_operation_transpose, diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_batched.cpp index 7ac5061e..06dda30c 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_batched.cpp @@ -6,15 +6,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_strided_batched.cpp b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_strided_batched.cpp index 2e49ab4b..6c081fc4 100644 --- a/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_strided_batched.cpp +++ b/ROCm_Libraries/rocSOLVER/src/lapack/roclapack_potrf_strided_batched.cpp @@ -5,15 +5,15 @@ #include "roclapack_potrf.hpp" template -rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, - const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, - rocblas_int* info, const rocblas_int batch_count) -{ +rocblas_status rocsolver_potrf_strided_batched_impl(rocblas_handle handle, const rocblas_fill uplo, + const rocblas_int n, U A, const rocblas_int lda, const rocblas_int strideA, + rocblas_int* info, const rocblas_int batch_count) +{ if(!handle) return rocblas_status_invalid_handle; - - //logging is missing ??? - + + //logging is missing ??? + if (!A || !info) return rocblas_status_invalid_pointer; if (n < 0 || lda < n || batch_count < 0) diff --git a/ROCm_Libraries/rocSOLVER/src/rocsolver-config.cmake.in b/ROCm_Libraries/rocSOLVER/src/rocsolver-config.cmake.in index 970adc43..8b6304e0 100644 --- a/ROCm_Libraries/rocSOLVER/src/rocsolver-config.cmake.in +++ b/ROCm_Libraries/rocSOLVER/src/rocsolver-config.cmake.in @@ -1,6 +1,6 @@ @PACKAGE_INIT@ - + set_and_check(rocsolver_INCLUDE_DIR @PACKAGE_INCLUDE_INSTALL_DIR@) set_and_check(rocsolver_INCLUDE_DIRS @PACKAGE_INCLUDE_INSTALL_DIR@) diff --git a/ROCm_Libraries/rocSPARSE/Doxyfile b/ROCm_Libraries/rocSPARSE/Doxyfile index e7a87a6b..0f6ee32e 100644 --- a/ROCm_Libraries/rocSPARSE/Doxyfile +++ b/ROCm_Libraries/rocSPARSE/Doxyfile @@ -162,7 +162,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -171,7 +171,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. 
-STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -244,7 +244,7 @@ ALIASES = # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -291,7 +291,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -641,7 +641,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -683,7 +683,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -696,7 +696,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -706,7 +706,7 @@ LAYOUT_FILE = # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. -CITE_BIB_FILES = +CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages @@ -765,7 +765,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -781,7 +781,7 @@ INPUT = ROCm_Libraries/rocSPARSE/src/modules.dox \ ROCm_Libraries/rocSPARSE/src/rocsparse-functions_sed.h \ ROCm_Libraries/rocSPARSE/src/rocsparse-auxiliary_sed.h \ ROCm_Libraries/rocSPARSE/src/rocsparse-types.h \ - ROCm_Libraries/rocSPARSE/src/rocsparse.h + ROCm_Libraries/rocSPARSE/src/rocsparse.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -862,7 +862,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. 
-EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -878,7 +878,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -889,13 +889,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). -EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -915,7 +915,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -932,7 +932,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -941,7 +941,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -956,7 +956,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1068,7 +1068,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1094,7 +1094,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1138,7 +1138,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. 
If the tag is left blank doxygen will generate a standard @@ -1148,7 +1148,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1160,7 +1160,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1173,7 +1173,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1183,7 +1183,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1312,7 +1312,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1320,7 +1320,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1333,7 +1333,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1364,7 +1364,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1389,7 +1389,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1397,21 +1397,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. 
-QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1553,7 +1553,7 @@ MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1613,7 +1613,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1629,7 +1629,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1639,7 +1639,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1703,7 +1703,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1719,7 +1719,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1730,7 +1730,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1741,7 +1741,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1749,7 +1749,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). 
The PDF file will @@ -1851,14 +1851,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1903,7 +1903,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -1922,7 +1922,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = YES +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -2016,7 +2016,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2057,7 +2057,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2065,7 +2065,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2075,7 +2075,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2084,7 +2084,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2113,13 +2113,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. 
-GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2168,14 +2168,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. @@ -2224,7 +2224,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2368,26 +2368,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2395,12 +2395,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocr/Doxyfile b/ROCm_Libraries/rocr/Doxyfile index ecf24e6d..3d39ca34 100644 --- a/ROCm_Libraries/rocr/Doxyfile +++ b/ROCm_Libraries/rocr/Doxyfile @@ -164,7 +164,7 @@ FULL_PATH_NAMES = YES # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. -STRIP_FROM_PATH = +STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which @@ -173,7 +173,7 @@ STRIP_FROM_PATH = # specify the list of include paths that are normally passed to the compiler # using the -I flag. 
-STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't @@ -240,13 +240,13 @@ TAB_SIZE = 4 # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. -ALIASES = +ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. -TCL_SUBST = +TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For @@ -295,7 +295,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. -EXTENSION_MAPPING = +EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable @@ -649,7 +649,7 @@ GENERATE_DEPRECATEDLIST= YES # sections, marked by \if ... \endif and \cond # ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the @@ -691,7 +691,7 @@ SHOW_NAMESPACES = YES # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. -FILE_VERSION_FILTER = +FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated @@ -704,7 +704,7 @@ FILE_VERSION_FILTER = # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib @@ -773,7 +773,7 @@ WARN_FORMAT = "$file:$line: $text" # messages should be written. If left blank the output is written to standard # error (stderr). -WARN_LOGFILE = +WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files @@ -868,7 +868,7 @@ RECURSIVE = NO # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = +EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -884,7 +884,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the @@ -895,13 +895,13 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). 
-EXAMPLE_PATH = +EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and @@ -921,7 +921,7 @@ EXAMPLE_RECURSIVE = NO # that contain images that are to be included in the documentation (see the # \image command). -IMAGE_PATH = +IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program @@ -938,7 +938,7 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the @@ -947,7 +947,7 @@ INPUT_FILTER = # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for @@ -962,7 +962,7 @@ FILTER_SOURCE_FILES = NO # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. -FILTER_SOURCE_PATTERNS = +FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page @@ -1074,7 +1074,7 @@ CLANG_ASSISTED_PARSING = NO # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. -CLANG_OPTIONS = +CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index @@ -1100,7 +1100,7 @@ COLS_IN_ALPHA_INDEX = 5 # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output @@ -1145,7 +1145,7 @@ HTML_FILE_EXTENSION = .html # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_HEADER = +HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard @@ -1155,7 +1155,7 @@ HTML_HEADER = # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_FOOTER = +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of @@ -1167,7 +1167,7 @@ HTML_FOOTER = # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_STYLESHEET = +HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets @@ -1180,7 +1180,7 @@ HTML_STYLESHEET = # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. 
-HTML_EXTRA_STYLESHEET = +HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note @@ -1190,7 +1190,7 @@ HTML_EXTRA_STYLESHEET = # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. -HTML_EXTRA_FILES = +HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to @@ -1319,7 +1319,7 @@ GENERATE_HTMLHELP = NO # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, @@ -1327,7 +1327,7 @@ CHM_FILE = # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). @@ -1340,7 +1340,7 @@ GENERATE_CHI = NO # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it @@ -1371,7 +1371,7 @@ GENERATE_QHP = NO # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace @@ -1396,7 +1396,7 @@ QHP_VIRTUAL_FOLDER = doc # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom @@ -1404,21 +1404,21 @@ QHP_CUST_FILTER_NAME = # filters). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To @@ -1551,7 +1551,7 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = +MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. 
See the MathJax site @@ -1559,7 +1559,7 @@ MATHJAX_EXTENSIONS = # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_CODEFILE = +MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and @@ -1619,7 +1619,7 @@ EXTERNAL_SEARCH = NO # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. -SEARCHENGINE_URL = +SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the @@ -1635,7 +1635,7 @@ SEARCHDATA_FILE = searchdata.xml # projects and redirect the results back to the right project. # This tag requires that the tag SEARCHENGINE is set to YES. -EXTERNAL_SEARCH_ID = +EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are @@ -1645,7 +1645,7 @@ EXTERNAL_SEARCH_ID = # EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ... # This tag requires that the tag SEARCHENGINE is set to YES. -EXTRA_SEARCH_MAPPINGS = +EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # Configuration options related to the LaTeX output @@ -1709,7 +1709,7 @@ PAPER_TYPE = a4 # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = +EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first @@ -1725,7 +1725,7 @@ EXTRA_PACKAGES = # to HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_HEADER = +LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the # generated LaTeX document. The footer should contain everything after the last @@ -1736,7 +1736,7 @@ LATEX_HEADER = # Note: Only use a user-defined footer if you know what you are doing! # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_FOOTER = +LATEX_FOOTER = # The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined # LaTeX style sheets that are included after the standard style sheets created @@ -1747,7 +1747,7 @@ LATEX_FOOTER = # list). # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_STYLESHEET = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the LATEX_OUTPUT output @@ -1755,7 +1755,7 @@ LATEX_EXTRA_STYLESHEET = # markers available. # This tag requires that the tag GENERATE_LATEX is set to YES. -LATEX_EXTRA_FILES = +LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is # prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will @@ -1855,14 +1855,14 @@ RTF_HYPERLINKS = NO # default style sheet that doxygen normally uses. # This tag requires that the tag GENERATE_RTF is set to YES. -RTF_STYLESHEET_FILE = +RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is # similar to doxygen's config file. A template extensions file can be generated # using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. 
-RTF_EXTENSIONS_FILE = +RTF_EXTENSIONS_FILE = # If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code # with syntax highlighting in the RTF output. @@ -1907,7 +1907,7 @@ MAN_EXTENSION = .3 # MAN_EXTENSION with the initial . removed. # This tag requires that the tag GENERATE_MAN is set to YES. -MAN_SUBDIR = +MAN_SUBDIR = # If the MAN_LINKS tag is set to YES and doxygen generates man output, then it # will generate one additional man file for each entity documented in the real @@ -2019,7 +2019,7 @@ PERLMOD_PRETTY = YES # overwrite each other's variables. # This tag requires that the tag GENERATE_PERLMOD is set to YES. -PERLMOD_MAKEVAR_PREFIX = +PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor @@ -2060,7 +2060,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the @@ -2068,7 +2068,7 @@ INCLUDE_PATH = # used. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -INCLUDE_FILE_PATTERNS = +INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that are # defined before the preprocessor is started (similar to the -D option of e.g. @@ -2078,7 +2078,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2087,7 +2087,7 @@ PREDEFINED = # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have @@ -2116,13 +2116,13 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = +TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to # external documentation" for more information about the usage of tag files. -GENERATE_TAGFILE = +GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES, all external class will be listed in # the class index. If set to NO, only the inherited external classes will be @@ -2171,14 +2171,14 @@ CLASS_DIAGRAMS = NO # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. -MSCGEN_PATH = +MSCGEN_PATH = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. -DIA_PATH = +DIA_PATH = # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. 
@@ -2227,7 +2227,7 @@ DOT_FONTSIZE = 10 # the path where dot can find it using this tag. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTPATH = +DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for # each documented class showing the direct and indirect inheritance relations. @@ -2371,26 +2371,26 @@ INTERACTIVE_SVG = NO # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_PATH = +DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. -DOTFILE_DIRS = +DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command). -MSCFILE_DIRS = +MSCFILE_DIRS = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). -DIAFILE_DIRS = +DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file. If left blank, it is assumed @@ -2398,12 +2398,12 @@ DIAFILE_DIRS = # generate a warning when it encounters a \startuml command in this case and # will not generate output for the diagram. -PLANTUML_JAR_PATH = +PLANTUML_JAR_PATH = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. -PLANTUML_INCLUDE_PATH = +PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes diff --git a/ROCm_Libraries/rocr/src/README.md b/ROCm_Libraries/rocr/src/README.md index f06be976..c4a790c9 100644 --- a/ROCm_Libraries/rocr/src/README.md +++ b/ROCm_Libraries/rocr/src/README.md @@ -40,7 +40,7 @@ hsakmt.h header file must be available. The latest version of these files can be obtained from the ROCT-Thunk-Interface repository, available here: https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface - + Specify the directory containing libhsakmt.so.1 and hsakmt.h using the cmake variables, HSAKMT_LIB_PATH and HSAKMT_INC_PATH. These can be specified either on the command line or via standard cmake configuration tools such as ccmake or cmake-gui. @@ -52,7 +52,7 @@ For example, from the top level ROCR repository execute: -DHSAKMT_LIB_PATH:STRING= \ .. make - + alternately using ccmake: mkdir build diff --git a/ROCm_Libraries/rocr/src/cmake_modules/COPYING-CMAKE-SCRIPTS b/ROCm_Libraries/rocr/src/cmake_modules/COPYING-CMAKE-SCRIPTS index 4b417765..53b6b71e 100644 --- a/ROCm_Libraries/rocr/src/cmake_modules/COPYING-CMAKE-SCRIPTS +++ b/ROCm_Libraries/rocr/src/cmake_modules/COPYING-CMAKE-SCRIPTS @@ -7,7 +7,7 @@ are met: 2. Redistributions in binary form must reproduce the copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. The name of the author may not be used to endorse or promote products +3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR diff --git a/ROCm_Libraries/rocr/src/cmake_modules/utils.cmake b/ROCm_Libraries/rocr/src/cmake_modules/utils.cmake index 0530c87f..44a62e62 100644 --- a/ROCm_Libraries/rocr/src/cmake_modules/utils.cmake +++ b/ROCm_Libraries/rocr/src/cmake_modules/utils.cmake @@ -90,21 +90,21 @@ function ( get_version DEFAULT_VERSION_STRING ) parse_version ( ${DEFAULT_VERSION_STRING} ) ## find_program ( GIT NAMES git ) -## +## ## if ( GIT ) -## +## ## execute_process ( COMMAND git describe --tags --dirty --long ## WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ## OUTPUT_VARIABLE GIT_TAG_STRING ## OUTPUT_STRIP_TRAILING_WHITESPACE ## RESULT_VARIABLE RESULT ) -## +## ## if ( ${RESULT} EQUAL 0 ) -## +## ## parse_version ( ${GIT_TAG_STRING} ) -## +## ## endif () -## +## ## endif () set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE ) diff --git a/ROCm_Libraries/rocr/src/core/common/shared.h b/ROCm_Libraries/rocr/src/core/common/shared.h index dc33ac7d..5ca99d93 100644 --- a/ROCm_Libraries/rocr/src/core/common/shared.h +++ b/ROCm_Libraries/rocr/src/core/common/shared.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/agent.h b/ROCm_Libraries/rocr/src/core/inc/agent.h index 8a1b4050..0760df70 100644 --- a/ROCm_Libraries/rocr/src/core/inc/agent.h +++ b/ROCm_Libraries/rocr/src/core/inc/agent.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_blit_kernel.h b/ROCm_Libraries/rocr/src/core/inc/amd_blit_kernel.h index b7e63d03..07d5229d 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_blit_kernel.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_blit_kernel.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_blit_sdma.h b/ROCm_Libraries/rocr/src/core/inc/amd_blit_sdma.h index 181cd687..dc40421b 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_blit_sdma.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_blit_sdma.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_cpu_agent.h b/ROCm_Libraries/rocr/src/core/inc/amd_cpu_agent.h index af5de53d..dd994ef2 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_cpu_agent.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_cpu_agent.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_elf_image.hpp b/ROCm_Libraries/rocr/src/core/inc/amd_elf_image.hpp index c0cde933..bd181757 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_elf_image.hpp +++ b/ROCm_Libraries/rocr/src/core/inc/amd_elf_image.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_gpu_agent.h b/ROCm_Libraries/rocr/src/core/inc/amd_gpu_agent.h index db299842..fef245a3 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_gpu_agent.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_gpu_agent.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_hsa_code.hpp b/ROCm_Libraries/rocr/src/core/inc/amd_hsa_code.hpp index b3fcbc2f..06b577c2 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_hsa_code.hpp +++ b/ROCm_Libraries/rocr/src/core/inc/amd_hsa_code.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_hsa_loader.hpp b/ROCm_Libraries/rocr/src/core/inc/amd_hsa_loader.hpp index 4b90f0e2..0e743bb2 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_hsa_loader.hpp +++ b/ROCm_Libraries/rocr/src/core/inc/amd_hsa_loader.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_loader_context.hpp b/ROCm_Libraries/rocr/src/core/inc/amd_loader_context.hpp index 27830ff3..4254e4cc 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_loader_context.hpp +++ b/ROCm_Libraries/rocr/src/core/inc/amd_loader_context.hpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_memory_region.h b/ROCm_Libraries/rocr/src/core/inc/amd_memory_region.h index 08bb78d9..dad165f0 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_memory_region.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_memory_region.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/amd_topology.h b/ROCm_Libraries/rocr/src/core/inc/amd_topology.h index f0c0eabc..8e62679d 100644 --- a/ROCm_Libraries/rocr/src/core/inc/amd_topology.h +++ b/ROCm_Libraries/rocr/src/core/inc/amd_topology.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/blit.h b/ROCm_Libraries/rocr/src/core/inc/blit.h index 57189361..e7427e43 100644 --- a/ROCm_Libraries/rocr/src/core/inc/blit.h +++ b/ROCm_Libraries/rocr/src/core/inc/blit.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/checked.h b/ROCm_Libraries/rocr/src/core/inc/checked.h index 856d22ba..ea2b2122 100644 --- a/ROCm_Libraries/rocr/src/core/inc/checked.h +++ b/ROCm_Libraries/rocr/src/core/inc/checked.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/hsa_api_trace_int.h b/ROCm_Libraries/rocr/src/core/inc/hsa_api_trace_int.h index b41a8161..f458deb1 100644 --- a/ROCm_Libraries/rocr/src/core/inc/hsa_api_trace_int.h +++ b/ROCm_Libraries/rocr/src/core/inc/hsa_api_trace_int.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/hsa_ext_interface.h b/ROCm_Libraries/rocr/src/core/inc/hsa_ext_interface.h index 236a165c..7b664003 100644 --- a/ROCm_Libraries/rocr/src/core/inc/hsa_ext_interface.h +++ b/ROCm_Libraries/rocr/src/core/inc/hsa_ext_interface.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/hsa_internal.h b/ROCm_Libraries/rocr/src/core/inc/hsa_internal.h index 8f1f7610..3b4151a5 100644 --- a/ROCm_Libraries/rocr/src/core/inc/hsa_internal.h +++ b/ROCm_Libraries/rocr/src/core/inc/hsa_internal.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/hsa_table_interface.h b/ROCm_Libraries/rocr/src/core/inc/hsa_table_interface.h index 8571c9a1..21081d0f 100644 --- a/ROCm_Libraries/rocr/src/core/inc/hsa_table_interface.h +++ b/ROCm_Libraries/rocr/src/core/inc/hsa_table_interface.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/isa.h b/ROCm_Libraries/rocr/src/core/inc/isa.h index c5dba5f7..13fa38b8 100644 --- a/ROCm_Libraries/rocr/src/core/inc/isa.h +++ b/ROCm_Libraries/rocr/src/core/inc/isa.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/memory_region.h b/ROCm_Libraries/rocr/src/core/inc/memory_region.h index 6281413d..391a6607 100644 --- a/ROCm_Libraries/rocr/src/core/inc/memory_region.h +++ b/ROCm_Libraries/rocr/src/core/inc/memory_region.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/inc/registers.h b/ROCm_Libraries/rocr/src/core/inc/registers.h index 39d86aec..d2bffb65 100644 --- a/ROCm_Libraries/rocr/src/core/inc/registers.h +++ b/ROCm_Libraries/rocr/src/core/inc/registers.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_blit_kernel.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_blit_kernel.cpp index f1f235c2..846c0d71 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_blit_kernel.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_blit_kernel.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_cpu_agent.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_cpu_agent.cpp index d97bebf7..adf1d207 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_cpu_agent.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_cpu_agent.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -187,12 +187,12 @@ hsa_status_t CpuAgent::IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, } hsa_status_t CpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { - + // agent, and vendor name size limit const size_t attribute_u = static_cast(attribute); - + switch (attribute_u) { - + // The code copies HsaNodeProperties.MarketingName a Unicode string // which is encoded in UTF-16 as a 7-bit ASCII string. 
The value of // HsaNodeProperties.MarketingName is obtained from the "model name" diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_gpu_agent.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_gpu_agent.cpp index 9706ca07..3d1ddd9c 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_gpu_agent.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_gpu_agent.cpp @@ -696,12 +696,12 @@ hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) { } hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { - + // agent, and vendor name size limit const size_t attribute_u = static_cast(attribute); - + switch (attribute_u) { - + // Build agent name by concatenating the Major, Minor and Stepping Ids // of devices compute capability with a prefix of "gfx" case HSA_AGENT_INFO_NAME: { @@ -873,7 +873,7 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AMD_AGENT_INFO_MEMORY_MAX_FREQUENCY: *((uint32_t*)value) = memory_max_frequency_; break; - + // The code copies HsaNodeProperties.MarketingName a Unicode string // which is encoded in UTF-16 as a 7-bit ASCII string case HSA_AMD_AGENT_INFO_PRODUCT_NAME: { diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_loader_context.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_loader_context.cpp index 14b2b4de..ce45d47f 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_loader_context.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_loader_context.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/amd_memory_region.cpp b/ROCm_Libraries/rocr/src/core/runtime/amd_memory_region.cpp index 97daa850..51bdbe19 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/amd_memory_region.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/amd_memory_region.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/hsa_ext_interface.cpp b/ROCm_Libraries/rocr/src/core/runtime/hsa_ext_interface.cpp index 1fc08ca8..b8db95ba 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/hsa_ext_interface.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/hsa_ext_interface.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -176,7 +176,7 @@ ExtensionEntryPoints::ExtensionEntryPoints() { // Initialize Finalizer function table to be NULLs void ExtensionEntryPoints::InitFinalizerExtTable() { - + // Initialize Version of Api Table finalizer_api.version.major_id = 0x00; finalizer_api.version.minor_id = 0x00; @@ -192,7 +192,7 @@ void ExtensionEntryPoints::InitFinalizerExtTable() { // Initialize Image function table to be NULLs void ExtensionEntryPoints::InitImageExtTable() { - + // Initialize Version of Api Table image_api.version.major_id = 0x00; image_api.version.minor_id = 0x00; @@ -224,16 +224,16 @@ void ExtensionEntryPoints::InitAmdExtTable() { // @note: Interface should be updated when Amd Ext table // begins hosting Api's from other extension libraries void ExtensionEntryPoints::UpdateAmdExtTable(void *func_ptr) { - + assert(hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn == - (decltype(hsa_amd_image_create)*)hsa_ext_null && + (decltype(hsa_amd_image_create)*)hsa_ext_null && "Duplicate load of extension import."); assert(hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn == - (decltype(hsa_amd_image_create)*)hsa_ext_null && + (decltype(hsa_amd_image_create)*)hsa_ext_null && "Duplicate load of extension import."); - hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = + hsa_api_table_.amd_ext_api.hsa_amd_image_create_fn = (decltype(hsa_amd_image_create)*)func_ptr; - hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = + hsa_internal_api_table_.amd_ext_api.hsa_amd_image_create_fn = (decltype(hsa_amd_image_create)*)func_ptr; } @@ -265,7 +265,7 @@ bool ExtensionEntryPoints::LoadImage(std::string library_name) { return false; } libs_.push_back(lib); - + void* ptr; ptr = os::GetExportAddress(lib, "hsa_ext_image_get_capability_impl"); @@ -390,7 +390,7 @@ bool ExtensionEntryPoints::LoadImage(std::string library_name) { if (ptr != NULL) { UpdateAmdExtTable(ptr); } - + // Initialize Version of Api Table image_api.version.major_id = HSA_IMAGE_API_TABLE_MAJOR_VERSION; image_api.version.minor_id = sizeof(ImageExtTable); @@ -414,7 +414,7 @@ bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) { return false; } libs_.push_back(lib); - + void* ptr; ptr = os::GetExportAddress(lib, "hsa_ext_program_create_impl"); @@ -469,12 +469,12 @@ bool ExtensionEntryPoints::LoadFinalizer(std::string library_name) { finalizer_api.hsa_ext_program_finalize_fn = (decltype(::hsa_ext_program_finalize)*)ptr; } - + // Initialize Version of Api Table finalizer_api.version.major_id = HSA_FINALIZER_API_TABLE_MAJOR_VERSION; finalizer_api.version.minor_id = sizeof(::FinalizerExtTable); finalizer_api.version.step_id = HSA_FINALIZER_API_TABLE_STEP_VERSION; - + // Update handle of table of HSA extensions hsa_internal_api_table_.CloneExts(&finalizer_api, core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID); diff --git a/ROCm_Libraries/rocr/src/core/runtime/interrupt_signal.cpp b/ROCm_Libraries/rocr/src/core/runtime/interrupt_signal.cpp index 4342decc..b2fb6a3a 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/interrupt_signal.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/interrupt_signal.cpp @@ -193,7 +193,7 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( value = atomic::Load(&signal_.value, std::memory_order_relaxed); return hsa_signal_value_t(value); } - + if (wait_hint == HSA_WAIT_STATE_ACTIVE) { continue; } diff --git a/ROCm_Libraries/rocr/src/core/runtime/isa.cpp b/ROCm_Libraries/rocr/src/core/runtime/isa.cpp index 7c9768c1..bc916ea8 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/isa.cpp 
+++ b/ROCm_Libraries/rocr/src/core/runtime/isa.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/runtime/runtime.cpp b/ROCm_Libraries/rocr/src/core/runtime/runtime.cpp index d4896424..f381305b 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/runtime.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/runtime.cpp @@ -374,7 +374,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { /* GPU-GPU - functional support, not a performance path. - + This goes through system memory because we have to support copying between non-peer GPUs and we can't use P2P pointers even if the GPUs are peers. Because hsa_amd_agents_allow_access requires the caller to specify all allowed agents we can't assume that a peer mapped pointer diff --git a/ROCm_Libraries/rocr/src/core/runtime/signal.cpp b/ROCm_Libraries/rocr/src/core/runtime/signal.cpp index fa24c421..e0890a47 100644 --- a/ROCm_Libraries/rocr/src/core/runtime/signal.cpp +++ b/ROCm_Libraries/rocr/src/core/runtime/signal.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. 
// - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/atomic_helpers.h b/ROCm_Libraries/rocr/src/core/util/atomic_helpers.h index 69a2a58a..c162629c 100644 --- a/ROCm_Libraries/rocr/src/core/util/atomic_helpers.h +++ b/ROCm_Libraries/rocr/src/core/util/atomic_helpers.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/lnx/os_linux.cpp b/ROCm_Libraries/rocr/src/core/util/lnx/os_linux.cpp index 24974185..86be5524 100644 --- a/ROCm_Libraries/rocr/src/core/util/lnx/os_linux.cpp +++ b/ROCm_Libraries/rocr/src/core/util/lnx/os_linux.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. 
// - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/locks.h b/ROCm_Libraries/rocr/src/core/util/locks.h index 4b13c1e9..0a593667 100644 --- a/ROCm_Libraries/rocr/src/core/util/locks.h +++ b/ROCm_Libraries/rocr/src/core/util/locks.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/os.h b/ROCm_Libraries/rocr/src/core/util/os.h index 51031786..57b3eb2e 100644 --- a/ROCm_Libraries/rocr/src/core/util/os.h +++ b/ROCm_Libraries/rocr/src/core/util/os.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. 
-// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/small_heap.cpp b/ROCm_Libraries/rocr/src/core/util/small_heap.cpp index 6cd8e117..8c3b8560 100644 --- a/ROCm_Libraries/rocr/src/core/util/small_heap.cpp +++ b/ROCm_Libraries/rocr/src/core/util/small_heap.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/small_heap.h b/ROCm_Libraries/rocr/src/core/util/small_heap.h index d9064bba..824f5681 100644 --- a/ROCm_Libraries/rocr/src/core/util/small_heap.h +++ b/ROCm_Libraries/rocr/src/core/util/small_heap.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -51,7 +51,7 @@ #include #include "utils.h" - + class SmallHeap { private: struct Node; diff --git a/ROCm_Libraries/rocr/src/core/util/timer.cpp b/ROCm_Libraries/rocr/src/core/util/timer.cpp index a2cf13fb..f4476c11 100644 --- a/ROCm_Libraries/rocr/src/core/util/timer.cpp +++ b/ROCm_Libraries/rocr/src/core/util/timer.cpp @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/timer.h b/ROCm_Libraries/rocr/src/core/util/timer.h index 914bda34..42179956 100644 --- a/ROCm_Libraries/rocr/src/core/util/timer.h +++ b/ROCm_Libraries/rocr/src/core/util/timer.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/core/util/utils.h b/ROCm_Libraries/rocr/src/core/util/utils.h index f7f09e9d..312bf044 100755 --- a/ROCm_Libraries/rocr/src/core/util/utils.h +++ b/ROCm_Libraries/rocr/src/core/util/utils.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/amd_hsa_common.h b/ROCm_Libraries/rocr/src/inc/amd_hsa_common.h index bfb613ec..96b604ce 100644 --- a/ROCm_Libraries/rocr/src/inc/amd_hsa_common.h +++ b/ROCm_Libraries/rocr/src/inc/amd_hsa_common.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h b/ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h index 60f0c6d4..0f6003dd 100644 --- a/ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h +++ b/ROCm_Libraries/rocr/src/inc/amd_hsa_elf.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/amd_hsa_kernel_code.h b/ROCm_Libraries/rocr/src/inc/amd_hsa_kernel_code.h index 6c2742a6..34e81b97 100644 --- a/ROCm_Libraries/rocr/src/inc/amd_hsa_kernel_code.h +++ b/ROCm_Libraries/rocr/src/inc/amd_hsa_kernel_code.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h b/ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h index 57aa1adc..deefc8f0 100644 --- a/ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h +++ b/ROCm_Libraries/rocr/src/inc/amd_hsa_signal.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/hsa.h b/ROCm_Libraries/rocr/src/inc/hsa.h index 3979219a..65db804a 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa.h +++ b/ROCm_Libraries/rocr/src/inc/hsa.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -632,7 +632,7 @@ hsa_status_t HSA_API hsa_system_major_extension_supported( uint16_t version_major, uint16_t *version_minor, bool* result); - + /** * @deprecated @@ -711,7 +711,7 @@ hsa_status_t HSA_API hsa_system_get_major_extension_table( uint16_t extension, uint16_t version_major, size_t table_length, - void *table); + void *table); /** * @brief Struct containing an opaque handle to an agent, a device that participates in @@ -1283,7 +1283,7 @@ hsa_status_t HSA_API hsa_agent_major_extension_supported( uint16_t version_major, uint16_t *version_minor, bool* result); - + /** @} */ diff --git a/ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h b/ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h index add80e52..ca9a23d6 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h +++ b/ROCm_Libraries/rocr/src/inc/hsa_ext_amd.h @@ -714,7 +714,7 @@ typedef enum { HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7, /** * This memory_pool can be made directly accessible by all the agents in the - * system (::hsa_amd_agent_memory_pool_get_info does not return + * system (::hsa_amd_agent_memory_pool_get_info does not return * ::HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED for any agent). The type of this * attribute is bool. */ diff --git a/ROCm_Libraries/rocr/src/inc/hsa_ext_finalize.h b/ROCm_Libraries/rocr/src/inc/hsa_ext_finalize.h index 014e49bf..1aeb92d0 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa_ext_finalize.h +++ b/ROCm_Libraries/rocr/src/inc/hsa_ext_finalize.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL diff --git a/ROCm_Libraries/rocr/src/inc/hsa_ext_image.h b/ROCm_Libraries/rocr/src/inc/hsa_ext_image.h index de358c3d..d64de9d2 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa_ext_image.h +++ b/ROCm_Libraries/rocr/src/inc/hsa_ext_image.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -54,7 +54,7 @@ #ifdef __cplusplus extern "C" { -#endif /*__cplusplus*/ +#endif /*__cplusplus*/ /** \defgroup ext-images Images and Samplers * @{ @@ -267,7 +267,7 @@ typedef enum { * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants. */ typedef uint32_t hsa_ext_image_channel_type32_t; - + /** * * @brief Channel order associated with the elements of an image. See @@ -303,7 +303,7 @@ typedef enum { * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants. */ typedef uint32_t hsa_ext_image_channel_order32_t; - + /** * @brief Image format. @@ -1170,7 +1170,7 @@ typedef enum { * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants. */ typedef uint32_t hsa_ext_sampler_coordinate_mode32_t; - + /** * @brief Sampler filter modes. See the Filter Mode section @@ -1446,9 +1446,9 @@ typedef struct hsa_ext_images_1_pfn_s { } hsa_ext_images_1_pfn_t; /** @} */ - + #ifdef __cplusplus } // end extern "C" block -#endif /*__cplusplus*/ +#endif /*__cplusplus*/ #endif diff --git a/ROCm_Libraries/rocr/src/inc/hsa_ven_amd_aqlprofile.h b/ROCm_Libraries/rocr/src/inc/hsa_ven_amd_aqlprofile.h index f087709d..184fc654 100644 --- a/ROCm_Libraries/rocr/src/inc/hsa_ven_amd_aqlprofile.h +++ b/ROCm_Libraries/rocr/src/inc/hsa_ven_amd_aqlprofile.h @@ -75,7 +75,7 @@ uint32_t hsa_ven_amd_aqlprofile_version_minor(); // output data. // // Returned status: -// hsa_status_t – HSA status codes are used from hsa.h header +// hsa_status_t - HSA status codes are used from hsa.h header // // Supported profiling features: // @@ -91,7 +91,7 @@ typedef enum { // Supported performance counters (PMC) blocks // The block ID is the same for a block instances set, for example -// each block instance from the TCC block set, TCC0, TCC1, …, TCCN +// each block instance from the TCC block set, TCC0, TCC1, ..., TCCN // will have the same block ID HSA_VEN_AMD_AQLPROFILE_BLOCKS_TCC. typedef enum { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_CPC = 0, @@ -132,8 +132,8 @@ typedef enum { } hsa_ven_amd_aqlprofile_block_name_t; // PMC event object structure -// ‘counter_id’ value is specified in GFXIPs perfcounter user guides -// which is the counters select value, “Performance Counters Selection” +// 'counter_id' value is specified in GFXIPs perfcounter user guides +// which is the counters select value, "Performance Counters Selection" // chapter. 
typedef struct { hsa_ven_amd_aqlprofile_block_name_t block_name; @@ -242,7 +242,7 @@ hsa_status_t hsa_ven_amd_aqlprofile_legacy_get_pm4( // Get profile info: // Generic method for getting various profile info including profile buffers // attributes like the command buffer size and the profiling PMC results. -// It’s implied that all counters are 64bit values. +// It's implied that all counters are 64bit values. // // Profile generic output data: typedef struct { diff --git a/ROCm_Libraries/rocr/src/libamdhsacode/amd_elf_image.cpp b/ROCm_Libraries/rocr/src/libamdhsacode/amd_elf_image.cpp index d24e1984..b3f33949 100644 --- a/ROCm_Libraries/rocr/src/libamdhsacode/amd_elf_image.cpp +++ b/ROCm_Libraries/rocr/src/libamdhsacode/amd_elf_image.cpp @@ -1550,7 +1550,7 @@ namespace amd { } } - GElfStringTable* GElfImage::addStringTable(const std::string& name) + GElfStringTable* GElfImage::addStringTable(const std::string& name) { GElfStringTable* stab = new GElfStringTable(this); sections.push_back(std::unique_ptr(stab)); diff --git a/ROCm_Libraries/rocr/src/loader/loaders.hpp b/ROCm_Libraries/rocr/src/loader/loaders.hpp index 94b3ceca..40540f8f 100644 --- a/ROCm_Libraries/rocr/src/loader/loaders.hpp +++ b/ROCm_Libraries/rocr/src/loader/loaders.hpp @@ -70,7 +70,7 @@ namespace loader { void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) override; bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) override; - + void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size = 0) override; void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override; diff --git a/ROCm_Network_Based_Programing/ROCm_RDMA.rst b/ROCm_Network_Based_Programing/ROCm_RDMA.rst index 725a36f6..4ea47661 100644 --- a/ROCm_Network_Based_Programing/ROCm_RDMA.rst +++ b/ROCm_Network_Based_Programing/ROCm_RDMA.rst @@ -1,7 +1,7 @@ RDMA OpenMPI MPICH -GasNet +GasNet OpenSHEMM -Chapel +Chapel UPC++ diff --git a/ROCm_Solutions/ROCr_Error_Codes.rst b/ROCm_Solutions/ROCr_Error_Codes.rst index f63771f1..096ce2c4 100644 --- a/ROCm_Solutions/ROCr_Error_Codes.rst +++ b/ROCm_Solutions/ROCr_Error_Codes.rst @@ -12,7 +12,7 @@ HSA Runtime Queue Error Codes +-----------+-------------------------+ | 64 | Group is too large | +-----------+-------------------------+ -| 128 | Out of VGPR’s | +| 128 | Out of VGPR's | +-----------+-------------------------+ | 0x80000000| Debug Trap | +-----------+-------------------------+ diff --git a/ROCm_System_Managment/ROCm-System-Managment.rst b/ROCm_System_Managment/ROCm-System-Managment.rst index b7912458..c0ed32dd 100644 --- a/ROCm_System_Managment/ROCm-System-Managment.rst +++ b/ROCm_System_Managment/ROCm-System-Managment.rst @@ -58,8 +58,8 @@ usage: rocm-smi [-h] [-d DEVICE [DEVICE ...]] [--alldevices] [--showhw] [-a] [-i =================================== =================================================================================== -h, --help show this help message and exit --gpureset Reset specified GPU (One GPU must be specified) - --load FILE Load Clock, Fan, Performance and Profile settings - --save FILE Save Clock, Fan, Performance and Profile settings + --load FILE Load Clock, Fan, Performance and Profile settings + --save FILE Save Clock, Fan, Performance and Profile settings =================================== 
=================================================================================== @@ -186,8 +186,8 @@ If the level ends with a %, the fan speed is calculated as pct*maxlevel/100 (max .. NOTES:: This option can be used in conjunction with the --setsclk/--setmclk mask - - Operating the GPU outside of specifications can cause irreparable damage to your hardware + + Operating the GPU outside of specifications can cause irreparable damage to your hardware Please observe the warning displayed when using this option This flag automatically sets the clock to the highest level, as only the highest level is increased by the OverDrive value @@ -231,16 +231,16 @@ If the level ends with a %, the fan speed is calculated as pct*maxlevel/100 (max **Clock Type Descriptions** -DCEFCLK - DCE (Display) FCLK - Data fabric (VG20 and later) - Data flow from XGMI, Memory, PCIe SCLK - GFXCLK (Graphics core) +DCEFCLK - DCE (Display) FCLK - Data fabric (VG20 and later) - Data flow from XGMI, Memory, PCIe SCLK - GFXCLK (Graphics core) .. Note:: SOCCLK split from SCLK as of Vega10. Pre-Vega10 they were both controlled by SCLK -MCLK - GPU Memory (VRAM) PCLK - PCIe bus +MCLK - GPU Memory (VRAM) PCLK - PCIe bus -.. Note:: +.. Note:: This gives 2 speeds, PCIe Gen1 x1 and the highest available based on the hardware @@ -346,12 +346,12 @@ All entries (except name) are optional, and should only be created in a given dr ********************* - Global attributes + Global attributes ********************* ================ ============================================================================================ name | The chip name.This should be a short, lowercase string, not containing whitespace, - | dashes, or the wildcard character '*'.This attribute represents the chip name. + | dashes, or the wildcard character '*'.This attribute represents the chip name. | It is the only mandatory attribute.I2C devices get this attribute created automatically. | RO @@ -363,28 +363,28 @@ update_interval | The interval at which the chip will update readings. ================ ============================================================================================ ************ - Voltages + Voltages ************ ====================== =============================================================================================== in[0-*]_min | Voltage min value. | Unit: millivolt | RW - + in[0-*]_lcrit | Voltage critical min value. | Unit: millivolt | RW | If voltage drops to or below this limit, the system may take drastic action such as power | down or reset. At the very least, it should report a fault. - + in[0-*]_max | Voltage max value. | Unit: millivolt | RW - + in[0-*]_crit | Voltage critical max value. | Unit: millivolt | RW - | If voltage reaches or exceeds this limit, the system may take drastic action such as power + | If voltage reaches or exceeds this limit, the system may take drastic action such as power | down or reset. At the very least, it should report a fault. in[0-*]_input | Voltage input value. @@ -392,8 +392,8 @@ in[0-*]_input | Voltage input value. | RO | Voltage measured on the chip pin.Actual voltage depends on the scaling resistors on the | motherboard, as recommended in the chip datasheet.This varies by chip and by motherboard. 
- | Because of this variation, values are generally NOT scaled by the chip driver, and must be - | done by the application.However, some drivers (notably lm87 and via686a) do scale, because + | Because of this variation, values are generally NOT scaled by the chip driver, and must be + | done by the application.However, some drivers (notably lm87 and via686a) do scale, because | of internal resistors built into a chip.These drivers will output the actual voltage. Rule of | thumb: drivers should report the voltage values at the "pins" of the chip. @@ -432,10 +432,10 @@ cpu[0-*]_vid | CPU core reference voltage. | RO | Not always correct. -vrm | Voltage Regulator Module version number. +vrm | Voltage Regulator Module version number. | RW (but changing it should no more be necessary) | Originally the VRM standard version multiplied by 10, but now an arbitrary number, as not - | all standards have a version number.Affects the way the driver calculates the CPU core + | all standards have a version number.Affects the way the driver calculates the CPU core | reference voltage from the vid pins. ====================== =============================================================================================== @@ -443,7 +443,7 @@ Also see the Alarms section for status flags associated with voltages. ******** - Fans + Fans ******** =============== ============================================================================================= @@ -470,9 +470,9 @@ fan[1-*]_div | Fan divisor. fan[1-*]_pulses | Number of tachometer pulses per fan revolution. | Integer value, typically between 1 and 4. | RW - | This value is a characteristic of the fan connected to the device's input, - | so it has to be set in accordance with the fan model.Should only be created - | if the chip has a register to configure the number of pulses. In the absence + | This value is a characteristic of the fan connected to the device's input, + | so it has to be set in accordance with the fan model.Should only be created + | if the chip has a register to configure the number of pulses. In the absence | of such a register (and thus attribute) the value assumed by all devices is 2 pulses | per fan revolution. @@ -484,7 +484,7 @@ fan[1-*]_target | Desired fan speed fan[1-*]_label | Suggested fan channel label. | Text string - | Should only be created if the driver has hints about what this fan channel is being + | Should only be created if the driver has hints about what this fan channel is being | used for, and user-space doesn't.In all other cases, the label is provided by user-space. | RO @@ -499,13 +499,13 @@ Also see the Alarms section for status flags associated with fans. ******* - PWM + PWM ******* - + +--------------------------------------+-----------------------------------------------------------------------------------------+ | pwm[1-*] | | Pulse width modulation fan control. | | | | Integer value in the range 0 to 255 | -| | | RW | +| | | RW | | | | 255 is max or 100%. | +--------------------------------------+-----------------------------------------------------------------------------------------+ | pwm[1-*]_enable | | Fan speed control method: | @@ -542,7 +542,7 @@ value (fastest fan speed) wins. **************** - Temperatures + Temperatures **************** ========================= ========================================================================================== @@ -589,7 +589,7 @@ temp[1-*]_crit_hyst | Temperature hysteresis value for critical limit. 
| Must be reported as an absolute temperature, NOT a delta from the critical value. | RW -temp[1-*]_emergency | Temperature emergency max value, for chips supporting more than two upper +temp[1-*]_emergency | Temperature emergency max value, for chips supporting more than two upper | temperature limits. Must be equal or greater than corresponding temp_crit values. | Unit: millidegree Celsius | RW @@ -613,8 +613,8 @@ temp[1-*]_offset | Temperature offset which is added to the temperature | Read/Write value. temp[1-*]_label | Suggested temperature channel label. - | Text string Should only be created if the driver has hints about what this temperature - | channel is being used for, and user-space doesn't. In all other cases, the label is + | Text string Should only be created if the driver has hints about what this temperature + | channel is being used for, and user-space doesn't. In all other cases, the label is | provided by user-space. | RO @@ -645,7 +645,7 @@ Also see the Alarms section for status flags associated with temperatures. ************ - Currents + Currents ************ ======================= ======================================================== @@ -697,7 +697,7 @@ curr[1-*]_enable | Enable or disable the sensors Also see the Alarms section for status flags associated with currents. ********* - Power + Power ********* ================================ =============================================================================== @@ -705,7 +705,7 @@ power[1-*]_average | Average power use | Unit: microWatt | RO -power[1-*]_average_interval | Power use averaging interval. A poll notification is sent to this +power[1-*]_average_interval | Power use averaging interval. A poll notification is sent to this | file if the hardware changes the averaging interval. | Unit: milliseconds | RW @@ -756,8 +756,8 @@ power[1-*]_accuracy | Accuracy of the power meter. | Unit: Percent | RO -power[1-*]_cap | If power use rises above this limit, the system should take action to - | reduce power use.A poll notification is sent to this file if the cap is +power[1-*]_cap | If power use rises above this limit, the system should take action to + | reduce power use.A poll notification is sent to this file if the cap is | changed by the hardware.The *_cap files only appear if the cap is known | to be enforced by hardware. | Unit: microWatt @@ -796,7 +796,7 @@ power[1-*]_enable | Enable or disable the sensors. Also see the Alarms section for status flags associated with power readings. ********** - Energy + Energy ********** ==================== ======================== @@ -812,7 +812,7 @@ energy[1-*]_enable | Enable or disable the sensors ==================== ======================== ************ - Humidity + Humidity ************ ==================== =========================================== @@ -828,7 +828,7 @@ humidity[1-*]_enable | Enable or disable the sensors ==================== =========================================== ********** - Alarms + Alarms ********** Each channel or limit may have an associated alarm file, containing a @@ -839,13 +839,13 @@ limit-related alarms, not both. The driver should just reflect the hardware implementation. 
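As a quick aside before the alarm attribute tables below: all of these hwmon attributes are plain-text integers in sysfs, so an application can consume them with ordinary file reads. The following is a minimal sketch (not part of the original tooling described here) that reads one temperature channel and its critical-alarm flag; the hwmon index, the channel number, and the presence of the alarm file are assumptions that vary by driver and board.

.. code:: cpp

    // Minimal sketch: read a hwmon temperature channel and its alarm flag.
    // The hwmon index (hwmon0) and channel number (temp1) are assumptions;
    // real applications must discover them at runtime.
    #include <fstream>
    #include <iostream>
    #include <string>

    static long read_attr(const std::string &path)
    {
        std::ifstream f(path);
        long value = 0;
        f >> value;                       // hwmon attributes are plain integers
        return value;
    }

    int main()
    {
        const std::string hwmon = "/sys/class/hwmon/hwmon0/";

        long temp  = read_attr(hwmon + "temp1_input");       // millidegree Celsius
        long alarm = read_attr(hwmon + "temp1_crit_alarm");  // boolean, may be absent

        std::cout << "temp1: " << temp / 1000.0 << " C, critical alarm: "
                  << (alarm ? "yes" : "no") << std::endl;
        return 0;
    }

The alarm files themselves are documented next.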
+---------------------+------------------+ -| | in[0-*]_alarm | | Channel alarm | +| | in[0-*]_alarm | | Channel alarm | | | curr[1-*]_alarm | | 0: no alarm | | | power[1-*]_alarm | | 1: alarm | | | fan[1-*]_alarm | | RO | -| | temp[1-*]_alarm | | +| | temp[1-*]_alarm | | +---------------------+------------------+ - + OR +----------------------------+---------------+ @@ -868,7 +868,7 @@ OR | | temp[1-*]_crit_alarm | | | | temp[1-*]_emergency_alarm| | +----------------------------+---------------+ - + Each input channel may have an associated fault file. This can be used to notify open diodes, unconnected fans etc. where the hardware supports it. When this boolean has value 1, the measurement for that @@ -878,23 +878,23 @@ channel should not be trusted. | | fan[1-*]_fault | | Input fault condition | | | temp[1-*]_fault | | 0: no fault occurred | | | | 1: fault condition | -| | | RO | +| | | RO | +-------------------+-------------------------+ - + Some chips also offer the possibility to get beeped when an alarm occurs: +-----------------+----------------------+ | beep_enable | | Master beep enable | -| | | 0: no beeps | -| | | 1: beeps | -| | | RW | +| | | 0: no beeps | +| | | 1: beeps | +| | | RW | +-----------------+----------------------+ -| | in[0-*]_beep | | Channel beep | +| | in[0-*]_beep | | Channel beep | | | curr[1-*]_beep| | 0: disable | | | fan[1-*]_beep | | 1: enable | | | temp[1-*]_beep| | RW | -+-----------------+----------------------+ - ++-----------------+----------------------+ + In theory, a chip could provide per-limit beep masking, but no such chip was seen so far. @@ -926,7 +926,7 @@ beep_mask | Bitmask for beep. *********************** - Intrusion detection + Intrusion detection *********************** ======================= =========================================================== @@ -959,8 +959,8 @@ samples | Sets number of average samples for all types of measurements. | RW in_samples | Sets number of average samples for specific type of measurements. -power_samples | Note that on some devices it won't be possible to set all of -curr_samples | them to different values so changing one might also change +power_samples | Note that on some devices it won't be possible to set all of +curr_samples | them to different values so changing one might also change curr_samples | some others. | RW @@ -1021,10 +1021,10 @@ Example2, fan divider setting, valid values 2, 4 and 8: /* write v to register */ ********* -Performance +Performance ********* -The pcie_bw sysfs file will report the usage of the PCIe bus over the last second, as a string with 3 integers: "bytes-received bytes-sent mps" . As there is no efficient way to calculate the size of each packet transmitted to and from the GPU in real time, the maximum payload size (mps), or the largest size of a PCIe packet, is included. The estimated bandwidth can then be calculated using by "bytes-received*mps + bytes-sent*mps" sed and multiplied by the number of packets received and sent. +The pcie_bw sysfs file will report the usage of the PCIe bus over the last second, as a string with 3 integers: "bytes-received bytes-sent mps" . As there is no efficient way to calculate the size of each packet transmitted to and from the GPU in real time, the maximum payload size (mps), or the largest size of a PCIe packet, is included. The estimated bandwidth can then be calculated using by "bytes-received*mps + bytes-sent*mps" sed and multiplied by the number of packets received and sent. 
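To make the bandwidth calculation above concrete, the sketch below parses the three integers and applies the formula quoted in the text. This is only an illustration under stated assumptions: the sysfs path depends on the card index and driver version, and the result is an upper-bound estimate because every counted packet is assumed to carry the full maximum payload size (mps).

.. code:: cpp

    // Hedged sketch: estimate PCIe bandwidth from the pcie_bw sysfs file.
    // The path below is an assumption; it varies with the card index.
    #include <fstream>
    #include <iostream>

    int main()
    {
        std::ifstream f("/sys/class/drm/card0/device/pcie_bw");
        unsigned long long received = 0, sent = 0, mps = 0;
        if (!(f >> received >> sent >> mps)) {
            std::cerr << "pcie_bw not available\n";
            return 1;
        }
        // The counters cover the last second, so the estimate is in bytes
        // per second: bytes-received*mps + bytes-sent*mps, as described above.
        unsigned long long estimate = received * mps + sent * mps;
        std::cout << "Estimated PCIe bandwidth: " << estimate << " bytes/s\n";
        return 0;
    }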
KFD Topology ============== @@ -1032,7 +1032,7 @@ KFD Topology Application software needs to understand the properties of the underlying hardware to leverage the performance capabilities of the platform for feature utilization and task scheduling. The sysfs topology exposes this information in a loosely hierarchal order. The information is populated by the KFD driver is gathered from ACPI (CRAT) and AMDGPU base driver. -| The sysfs topology is arranged hierarchically as following. The root directory of the topology is +| The sysfs topology is arranged hierarchically as following. The root directory of the topology is | **/sys/devices/virtual/kfd/kfd/topology/nodes/** Based on the platform inside this directory there will be sub-directories corresponding to each HSA Agent. A system with N HSA Agents will have N directories as shown below. @@ -1053,12 +1053,12 @@ This is available in the root directory of the HSA agent. This provides informat Memory ******** -The memory bank information attached to this agent is populated in “mem_banks” subdirectory. +The memory bank information attached to this agent is populated in "mem_banks" subdirectory. /sys/devices/virtual/kfd/kfd/topology/nodes/N/mem_banks Cache ******** -The caches available for this agent is populated in “cache” subdirectory +The caches available for this agent is populated in "cache" subdirectory /sys/devices/virtual/kfd/kfd/topology/nodes/N/cache IO-LINKS @@ -1069,7 +1069,7 @@ How to use topology information ********************************* The information provided in sysfs should not be directly used by application software. Application software should always use Thunk library API (libhsakmt) to access topology information. Please refer to Thunk API for more information. -The data are associated with a node ID, forming a per-node element list which references the elements contained at relative offsets within that list. A node associates with a kernel agent or agent. Node ID’s should be 0-based, with the “0” ID representing the primary elements of the system (e.g., “boot cores”, memory) if applicable. The enumeration order and—if applicable—values of the ID should match other information reported through mechanisms outside of the scope of the requirements; +The data are associated with a node ID, forming a per-node element list which references the elements contained at relative offsets within that list. A node associates with a kernel agent or agent. Node ID's should be 0-based, with the "0" ID representing the primary elements of the system (e.g., "boot cores", memory) if applicable. The enumeration order and--if applicable--values of the ID should match other information reported through mechanisms outside of the scope of the requirements; For example, the data and enumeration order contained in the ACPI SRAT table on some systems should match the memory order and properties reported through HSA. Further detail is out of the scope of the System Architecture and outlined in the Runtime API specification. @@ -1079,7 +1079,7 @@ Each of these nodes is interconnected with other nodes in more advanced systems .. image:: More_advanced_topology.png -Where applicable, the node grouping of physical memory follows NUMA principles to leverage memory locality in software when multiple physical memory blocks are available in the system and agents have a different “access cost” (e.g., bandwidth/latency) to that memory. 
+Where applicable, the node grouping of physical memory follows NUMA principles to leverage memory locality in software when multiple physical memory blocks are available in the system and agents have a different "access cost" (e.g., bandwidth/latency) to that memory. **KFD Topology structure for AMDGPU :** @@ -1110,7 +1110,7 @@ This can used by cooperating applications to effectively allocate GPU/GCDs among Device cgroup *************** -At a system administration level, the GPU/GCD isolation is possible using the device control group (cgroup). For all the AMD GPUs in a compute node, the ROCk-Kernel-Driver exposes a single compute device file /dev/kfd and a separate (Direct Rendering Infrastructure) render device files /dev/dri/renderDN for each device. To participate in the Linux kernel’s cgroup infrastructure, the ROCk driver relies on the render device files. +At a system administration level, the GPU/GCD isolation is possible using the device control group (cgroup). For all the AMD GPUs in a compute node, the ROCk-Kernel-Driver exposes a single compute device file /dev/kfd and a separate (Direct Rendering Infrastructure) render device files /dev/dri/renderDN for each device. To participate in the Linux kernel's cgroup infrastructure, the ROCk driver relies on the render device files. For example, consider a compute node with the two AMD GPUs. The ROCk-Kernel-Driver exposes the following device files: @@ -1122,9 +1122,9 @@ crw-rw---- 1 root video 226, 129 Apr 22 10:31 /dev/dri/renderD129 A ROCm application running on this compute node can use both GPUs only if it has access to all the above-listed device files. The administrator can restrict the devices an application can access by using device cgroup. The device cgroup subsystem allows or denies access to devices by applications in a cgroup. If a cgroup has whitelisted only /dev/kfd and /dev/dri/renderD129, then applications in that cgroup will have access only to that single GPU. -Refer to the Linux kernel's cgroup documentation for information on how to create a cgroup and whitelist devices. +Refer to the Linux kernel's cgroup documentation for information on how to create a cgroup and whitelist devices. -For cgroup-v1, refer https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt +For cgroup-v1, refer https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt For cgroup-v2, refer https://www.kernel.org/doc/Documentation/cgroup-v2.txt diff --git a/ROCm_System_Managment/topo1.rst b/ROCm_System_Managment/topo1.rst index 2a0050e8..c3097968 100644 --- a/ROCm_System_Managment/topo1.rst +++ b/ROCm_System_Managment/topo1.rst @@ -9,7 +9,7 @@ sysfs-class-kfd-topology | Description: Gives the details of system platform -| What: /sys/class/kfd/topology/platform_oem +| What: /sys/class/kfd/topology/platform_oem | Date: may 2018 | KernelVersion: 4.13 | description: This field gives the OEM(original equipment manufacturer) ID. 
Identifies HSA platform, reflects the OEMID in the CRAT diff --git a/ROCm_System_Managment/topo2.rst b/ROCm_System_Managment/topo2.rst index 96da2168..1293cfc3 100644 --- a/ROCm_System_Managment/topo2.rst +++ b/ROCm_System_Managment/topo2.rst @@ -13,96 +13,96 @@ sysfs-class-kfd-topology-nodes-N | Date: May 2018 | KernelVersion: 4.13 | Description: Here the number of smid (Single Instruction Multiple Data architecture) processes count is registered - + | What: /sys/class/kfd/topology/nodes/N/mem_banks_count | Date: May 2018 | KernelVersion: 4.13 | Description: This field gives the Number of discoverable memory bank affinity properties on this "H-NUMA" node - + | What: /sys/class/kfd/topology/nodes/N/caches_count | Date: May 2018 | KernelVersion: 4.13 | Description: Gives the Number of discoverable cache affinity properties on the "H-NUMA" node. - + | What: /sys/class/kfd/topology/nodes/N/io_links_count | Date: May 2018 | KernelVersion: 4.13 | Description: This field gives the number of discoverable IO link affinity properties of this node connecting to other nodes. - + | What: /sys/class/kfd/topology/nodes/N/cpu_cores_id | Date: May 2018 | KernelVersion: 4.13 | Description: Gives the CPU core id details corresponding to core count - + | What: /sys/class/kfd/topology/nodes/N/simd_id_base | Date: May 2018 | KernelVersion: 4.13 | Description: This field gives simd id value. - + | What: /sys/class/kfd/topology/nodes/N/max_waves_per_simd -| Date: May 2018 +| Date: May 2018 | KernelVersion: 4.13 | Description: This identifies the maximum number of launched waves per SIMD. If NUmSIMDCores is 0, this value is ignored - + | What: /sys/class/kfd/topology/nodes/N/gds_size_in_kb | Date: May 2018 | KernelVersion: 4.13 | Description: This field gives the size of Global Data Store in Kilobytes shared across SIMD Wavefronts, typically 32 or 64 - + | What: /sys/class/kfd/topology/nodes/N/wave_front_size | Date: May 2018 | KernelVersion: 4.13 | Description: wavefront is group of threads (work-item) that execute together for executing kernels and this field gives the size of the wavefront used. Usually 64or 32 or a different value for some HSA based architectures - + | What: /sys/class/kfd/topology/nodes/N/array_count | Date: May 2018 | KernelVersion: 4.13 | Description: This field give Number of SIMD Arrays per Engine - + | What: /sys/class/kfd/topology/nodes/N/simd_arrays_per_engine | Date: May 2018 | KernelVersion: 4.13 | Description: It gives the simd array count for every compute unite (stream engine) -| +| | What: /sys/class/kfd/topology/nodes/N/cu_per_simd_array | Date: May 2018 | KernelVersion: 4.13 | Description: Gives the Number of Compute Units (CU) per SIMD Array -| +| | What: /sys/class/kfd/topology/nodes/N/simd_per_cu | Date: May 2018 -| KernelVersion: 4.13 +| KernelVersion: 4.13 | Description: Number of SIMD representing a Compute Unit (CU) -| +| | What: /sys/class/kfd/topology/nodes/N/max_slots_scratch_cu | Date: May 2018 | KernelVersion: 4.13 | Description: Bitmask of available CU slots, used for CU mask setup for the queues if assignment is desired by application necessary. 
-| +| | What: /sys/class/kfd/topology/nodes/N/vendor_id | Date: May 2018 | KernelVersion: 4.13 | Description: This field contains the GPU vendor id; 0 on CPU-only nodes -| +| | What: /sys/class/kfd/topology/nodes/N/device_id | Date: May 2018 | KernelVersion: 4.13 | Description: This field contains the GPU device id; 0 on CPU-only nodes -| +| | What: /sys/class/kfd/topology/nodes/N/location_id | Date: May 2018 | KernelVersion: 4.13 | Description: LocationId, 32bit value, equivalent to BDF_ID used by Linux tools especially (identifies device in the overall system) -| +| | What: /sys/class/kfd/topology/nodes/N/drm_render_minor | Date: May 2018 | KernelVersion: 4.13 -| Description: drm (Direct Rendering Manager) render data count is shown -| +| Description: drm (Direct Rendering Manager) render data count is shown +| | What: /sys/class/kfd/topology/nodes/N/max_engine_clk_ccompute | Date: May 2018 | KernelVersion: 4.13 | Description: Maximum engine clock speed of the CPU -| +| diff --git a/ROCm_Tools/HCC-Native-GCN-ISA.rst b/ROCm_Tools/HCC-Native-GCN-ISA.rst index bd8c14d0..fff16500 100644 --- a/ROCm_Tools/HCC-Native-GCN-ISA.rst +++ b/ROCm_Tools/HCC-Native-GCN-ISA.rst @@ -41,7 +41,7 @@ Then install all other dependencies in order to build HCC from source: :: sudo apt-get install cmake git libelf-dev libc++abi-dev libc++-dev libdwarf-dev re2c libncurses5-dev patch wget file xz-utils libc6- dev-i386 python build-essential - + **CMake** If you are using Ubuntu 14.04, you would also need to upgrade to a newer version (>=3.0) of CMake as the version distributed by the distro is old for building clang/llvm. @@ -75,7 +75,7 @@ Install other development tools: :: sudo dnf groupinstall "Development Tools" - + **libc++ & libc++abi** HCC has a dependency on libc++ and libc++abi; however, Fedora/RHEL/CentOS don't provide a working binary package so you will to build them from source by following the instructions `here `_ @@ -122,7 +122,7 @@ It is recommended to install the release_36 release of libc++ and libc++abi and sudo make install cd ../libcxxabi sudo make install - + Add the libc++ and libc++abi installation path to the library search paths (i.e. export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib ) @@ -149,7 +149,7 @@ Fetch the source code :: repo sync - + **Build Instructions** :: @@ -182,7 +182,7 @@ You could also run the HCC's sanity test :: make test - + **Install the Compiler** :: diff --git a/ROCm_Tools/HCC_WIKI.rst b/ROCm_Tools/HCC_WIKI.rst index 01b6af18..535e5b2b 100644 --- a/ROCm_Tools/HCC_WIKI.rst +++ b/ROCm_Tools/HCC_WIKI.rst @@ -1,12 +1,12 @@ .. _HCCwiki: - + HCC WIKI ========= HCC is an Open Source, Optimizing C++ Compiler for Heterogeneous Compute ************************************************************************** -HCC supports heterogeneous offload to AMD APUs and discrete GPUs via HSA enabled runtimes and drivers. It is an ISO compliant C++ 11/14 compiler. It is based on Clang, the LLVM Compiler Infrastructure and the “libc++” C++ standard library. +HCC supports heterogeneous offload to AMD APUs and discrete GPUs via HSA enabled runtimes and drivers. It is an ISO compliant C++ 11/14 compiler. It is based on Clang, the LLVM Compiler Infrastructure and the "libc++" C++ standard library. Accelerator Modes Supported ***************************** @@ -14,7 +14,7 @@ Accelerator Modes Supported `HC (Heterogeneous Compute) C++ API `_ ++++++++++++++++++++++++++++++++++++++++++ -Inspired by C++ AMP and C++17, this is the default C++ compute API for the HCC compiler. 
HC has some important differences from C++ AMP including removing the “restrict” keyword, supporting additional data types in kernels, providing more control over synchronization and data movement, and providing pointer-based memory allocation. It is designed to expose cutting edge compute capabilities on Boltzmann and HSA devices to developers while offering the productivity and usability of C++. +Inspired by C++ AMP and C++17, this is the default C++ compute API for the HCC compiler. HC has some important differences from C++ AMP including removing the "restrict" keyword, supporting additional data types in kernels, providing more control over synchronization and data movement, and providing pointer-based memory allocation. It is designed to expose cutting edge compute capabilities on Boltzmann and HSA devices to developers while offering the productivity and usability of C++. `HIP `_ +++++++++++ @@ -73,9 +73,9 @@ Currently, HCC support for openSUSE is experimental and the compiler has to be b Building HCC from Source ######################## -First, install the build dependencies: +First, install the build dependencies: :: - + # Ubuntu 16.04 & 18.04 sudo apt-get install coreutils git cmake make g++ g++-multilib gcc-multilib python \ findutils libelf1 libpci3 file debianutils libunwind-dev pkg-config \ @@ -99,7 +99,7 @@ hsa-rocr-dev hsa-ext-rocr-dev hsakmt-roct-dev rocm-utils # openSUSE Leap 42.3 sudo zypper install coreutils git cmake make gcc-c++ python python-xml findutils elfutils pciutils-devel file rpm-build libunwind-devel pkg-config libpth-devel - + # install libc++ from OSB sudo zypper addrepo \ -f http://download.opensuse.org/repositories/devel:/tools:/compiler/openSUSE_Leap_42.3/ devel_tools_compiler @@ -107,17 +107,17 @@ hsa-rocr-dev hsa-ext-rocr-dev hsakmt-roct-dev rocm-utils sudo zypper install libc++-devel -Clone the HCC source tree: +Clone the HCC source tree: :: # automatically fetches all submodules git clone --recursive -b clang_tot_upgrade https://github.com/RadeonOpenCompute/hcc.git -Create a build directory and run cmake to configure the build: +Create a build directory and run cmake to configure the build: :: mkdir build; cd build cmake ../hcc -Compile HCC: +Compile HCC: :: make -j [number of threads] @@ -125,8 +125,8 @@ Install HCC: :: sudo make install -Run the unit tests: -:: +Run the unit tests: +:: make test Create an installer package (DEB or RPM file) @@ -147,7 +147,7 @@ To compile and link in a single step: To build with separate compile and link steps: :: # Assume HCC is installed and added to PATH - # Notice the the hcc-config command is between two backticks + # Notice the the hcc-config command is between two backticks hcc -hc saxpy.cpp -c -o saxpy.cpp.o hcc -hc saxpy.cpp.o -o saxpy @@ -158,7 +158,7 @@ By default, HCC would auto-detect all the GPUs available to run on and set the c ============ ================== ============================================================== GCN Version GPU/APU Family Examples of Radeon GPU - + ============ ================== ============================================================== gfx803 GFX8 R9 Fury, R9 Fury X, R9 Nano, FirePro S9300 x2, Radeon RX 480, @@ -166,7 +166,7 @@ gfx803 GFX8 R9 Fury, R9 Fury X, R9 Nano, FirePro S9300 x2, gfx900 GFX9 Vega10 -============ ================== ============================================================== +============ ================== ============================================================== Required AMDGPU Attributes diff --git a/ROCm_Tools/ROCm-Tools.rst 
b/ROCm_Tools/ROCm-Tools.rst index 22f06d3f..61cd831b 100644 --- a/ROCm_Tools/ROCm-Tools.rst +++ b/ROCm_Tools/ROCm-Tools.rst @@ -1,4 +1,4 @@ - + .. _ROCm-Tools: ===================== @@ -24,16 +24,16 @@ GCN Assembler and Disassembler The Art of AMDGCN Assembly: How to Bend the Machine to Your Will ***************************************************************** -The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a previous blog we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won’t always employ 100% of the GPU’s capabilities. Some reasons are the following: +The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a previous blog we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won't always employ 100% of the GPU's capabilities. Some reasons are the following: * The program may be written in a high level language that does not expose all of the features available on the hardware. - * The compiler is unable to produce optimal ISA code, either because the compiler needs to ‘play it safe’ while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. + * The compiler is unable to produce optimal ISA code, either because the compiler needs to 'play it safe' while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. -Consider a program that uses one of GCN’s new features (source code is available on `GitHub `_). Recent hardware architecture updates—DPP and DS Permute instructions—enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. +Consider a program that uses one of GCN's new features (source code is available on `GitHub `_). Recent hardware architecture updates--DPP and DS Permute instructions--enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. DS Permute Instructions ************************** -Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don’t write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says “put my lane data in lane i,” and ds_bpermute_b32 says “read data from lane i.” The GCN ISA Reference Guide provides a more formal description. 
The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: +Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don't write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says "put my lane data in lane i," and ds_bpermute_b32 says "read data from lane i." The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: .. code:: cpp @@ -45,7 +45,7 @@ Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to mov Passing Parameters to a Kernel ******************************* -Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables—except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: +Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables--except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: .. code:: cpp @@ -88,9 +88,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to “in” buffer - Kernarg(index); // Append base pointer to “index” buffer - Kernarg(out); // Append base pointer to “out” buffer + Kernarg(in); // Add base pointer to "in" buffer + Kernarg(index); // Append base pointer to "index" buffer + Kernarg(out); // Append base pointer to "out" buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as @@ -144,7 +144,7 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru flat_store_dword v[3:4], v1 s_endpgm -Currently, a programmer must manually set all non-default values to provide the necessary information. Hopefully, this situation will change with new updates that bring automatic register counting and possibly a new syntax to fill that structure. Before the start of every wavefront execution, the GPU sets up the register state on the basis of the enable_sgpr_* and enable_vgpr_* flags. VGPR v0 is always initialized with a work-item ID in the x dimension. 
Registers v1 and v2 can be initialized with work-item IDs in the y and z dimensions, respectively. Scalar GPRs can be initialized with a work-group ID and work-group count in each dimension, a dispatch ID, and pointers to kernarg, the aql packet, the aql queue, and so on. Again, the AMDGPU-ABI specification contains a full list in in the section on initial register state. For this example, a 64-bit base kernarg address will be stored in the s[0:1] registers (enable_sgpr_kernarg_segment_ptr = 1), and the work-item thread ID will occupy v0 (by default). Below is the scheme showing initial state for our kernel. +Currently, a programmer must manually set all non-default values to provide the necessary information. Hopefully, this situation will change with new updates that bring automatic register counting and possibly a new syntax to fill that structure. Before the start of every wavefront execution, the GPU sets up the register state on the basis of the enable_sgpr_* and enable_vgpr_* flags. VGPR v0 is always initialized with a work-item ID in the x dimension. Registers v1 and v2 can be initialized with work-item IDs in the y and z dimensions, respectively. Scalar GPRs can be initialized with a work-group ID and work-group count in each dimension, a dispatch ID, and pointers to kernarg, the aql packet, the aql queue, and so on. Again, the AMDGPU-ABI specification contains a full list in in the section on initial register state. For this example, a 64-bit base kernarg address will be stored in the s[0:1] registers (enable_sgpr_kernarg_segment_ptr = 1), and the work-item thread ID will occupy v0 (by default). Below is the scheme showing initial state for our kernel. .. image:: initial_state-768x387.png @@ -152,7 +152,7 @@ Currently, a programmer must manually set all non-default values to provide the The GPR Counting ****************** -The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0–v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0–s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront’s SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: +The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0-v4, so workitem_vgpr_count = 5. 
But wavefront_sgpr_count = 8 even though the code only shows s0-s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront's SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: :: @@ -388,7 +388,7 @@ rocprof 2. Profiling Modes ****************** -‘rocprof’ can be used for GPU profiling using HW counters and +'rocprof' can be used for GPU profiling using HW counters and application tracing 2.1. GPU profiling @@ -396,9 +396,9 @@ application tracing GPU profiling is controlled with input file which defines a list of metrics/counters and a profiling scope. An input file is provided using -option ‘-i ’. Output CSV file with a line per submitted kernel is +option '-i '. Output CSV file with a line per submitted kernel is generated. Each line has kernel name, kernel parameters and counter -values. By option ‘—stats’ the kernel execution stats can be generated +values. By option '--stats' the kernel execution stats can be generated in CSV format. Currently profiling has limitation of serializing submitted kernels. An example of input file: @@ -414,17 +414,17 @@ submitted kernels. An example of input file: gpu: 0 1 2 3 kernel: simple Pass1 simpleConvolutionPass2 -An example of profiling command line for ‘MatrixTranspose’ application +An example of profiling command line for 'MatrixTranspose' application :: $ rocprof -i input.txt MatrixTranspose - RPL: on '191018_011134' from '/…./rocprofiler_pkg' in '/…./MatrixTranspose' + RPL: on '191018_011134' from '/..../rocprofiler_pkg' in '/..../MatrixTranspose' RPL: profiling '"./MatrixTranspose"' RPL: input file 'input.txt' RPL: output dir '/tmp/rpl_data_191018_011134_9695' RPL: result dir '/tmp/rpl_data_191018_011134_9695/input0_results_191018_011134' - ROCProfiler: rc-file '/…./rpl_rc.xml' + ROCProfiler: rc-file '/..../rpl_rc.xml' ROCProfiler: input from "/tmp/rpl_data_191018_011134_9695/input0.xml" gpu_index = kernel = @@ -436,7 +436,7 @@ An example of profiling command line for ‘MatrixTranspose’ application PASSED! ROCProfiler: 1 contexts collected, output directory /tmp/rpl_data_191018_011134_9695/input0_results_191018_011134 - RPL: '/…./MatrixTranspose/input.csv' is generated + RPL: '/..../MatrixTranspose/input.csv' is generated **2.1.1. Counters and metrics** @@ -456,8 +456,8 @@ Metrics XML File Example: :: - - + + . . . @@ -469,14 +469,14 @@ Metrics XML File Example: **2.1.1.1. Metrics query** -Available counters and metrics can be queried by options ‘—list-basic’ -for counters and ‘—list-derived’ for derived metrics. The output for +Available counters and metrics can be queried by options '--list-basic' +for counters and '--list-derived' for derived metrics. The output for counters indicates number of block instances and number of block counter registers. The output for derived metrics prints the metrics expressions. Examples: @@ -484,8 +484,8 @@ expressions. 
Examples: :: $ rocprof --list-basic - RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' - ROCProfiler: rc-file '/…./rpl_rc.xml' + RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/..../MatrixTranspose' + ROCProfiler: rc-file '/..../rpl_rc.xml' Basic HW counters: gpu-agent0 : GRBM_COUNT : Tie High - Count Number of Clocks block GRBM has 2 counters @@ -541,12 +541,12 @@ metric groups: :: $ rocprof -i input.txt ./MatrixTranspose - RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' + RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/..../MatrixTranspose' RPL: profiling './MatrixTranspose' RPL: input file 'input.txt' RPL: output dir '/tmp/rpl_data_191018_032645_12106' RPL: result dir '/tmp/rpl_data_191018_032645_12106/input0_results_191018_032645' - ROCProfiler: rc-file '/…./rpl_rc.xml' + ROCProfiler: rc-file '/..../rpl_rc.xml' ROCProfiler: input from "/tmp/rpl_data_191018_032645_12106/input0.xml" gpu_index = kernel = @@ -570,35 +570,35 @@ ________________________________ - Collecting with multiple runs To collect several metric groups a full application replay is used by -defining several ‘pmc:’ lines in the input file, see 2.1. +defining several 'pmc:' lines in the input file, see 2.1. 2.2. Application tracing ************************ Supported application tracing includes runtime API and GPU activity -tracing’ Supported runtimes are: ROCr (HSA API) and HIP Supported GPU +tracing' Supported runtimes are: ROCr (HSA API) and HIP Supported GPU activity: kernel execution, async memory copy, barrier packets. The trace is generated in JSON format compatible with Chrome tracing. The trace consists of several sections with timelines for API trace per thread and GPU activity. The timelines events show event name and -parameters. Supported options: ‘—hsa-trace’, ‘—hip-trace’, ‘—sys-trace’, -where ‘sys trace’ is for HIP and HSA combined trace. +parameters. Supported options: '--hsa-trace', '--hip-trace', '--sys-trace', +where 'sys trace' is for HIP and HSA combined trace. **2.2.1. HIP runtime trace** -The trace is generated by option ‘—hip-trace’ and includes HIP API +The trace is generated by option '--hip-trace' and includes HIP API timelines and GPU activity at the runtime level. **2.2.2. ROCr runtime trace** -The trace is generated by option ‘—hsa-trace’ and includes ROCr API +The trace is generated by option '--hsa-trace' and includes ROCr API timelines and GPU activity at AQL queue level. Also, can provide counters per kernel. **2.2.3. KFD driver trace** -The trace is generated by option ‘—kfd-trace’ and includes KFD Thunk API +The trace is generated by option '--kfd-trace' and includes KFD Thunk API timelines. It is planned to include memory allocations/migration activity tracing. @@ -606,7 +606,7 @@ It is planned to include memory allocations/migration activity tracing. **2.2.4. Code annotation** Support for application code annotation. Start/stop API is supported to -programmatically control the profiling. A ‘roctx’ library provides +programmatically control the profiling. A 'roctx' library provides annotation API. Annotation is visualized in JSON trace as a separate "Markers and Ranges" timeline section. @@ -638,7 +638,7 @@ annotation API. Annotation is visualized in JSON trace as a separate **2.3. Multiple GPUs profiling** -The profiler supports multiple GPU’s profiling and provide GPI id for +The profiler supports multiple GPU's profiling and provide GPI id for counters and kernels data in CSV output file. 
Also, GPU id is indicating for respective GPU activity timeline in JSON trace. @@ -707,7 +707,7 @@ Profiler errors are logged to global logs: 4. 3rd party visualization tools ******************************** -‘rocprof’ is producing JSON trace compatible with Chrome Tracing, which +'rocprof' is producing JSON trace compatible with Chrome Tracing, which is an internal trace visualization tool in Google Chrome. 4.1. Chrome tracing @@ -719,7 +719,7 @@ Good review can be found by the link: 5. Command line options *********************** -The command line options can be printed with option ‘-h’: +The command line options can be printed with option '-h': :: @@ -845,34 +845,34 @@ Counters: :: - • GRBM_COUNT : Tie High - Count Number of Clocks - • GRBM_GUI_ACTIVE : The GUI is Active - • SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) - • SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) - • SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) - • SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) - • SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) - • SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) - • SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) - • SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) - • SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) - • SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) - • SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) - • SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) - • SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) - • SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) - • SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) - • TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. - • TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. - • TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. - • TCC_HIT[0-15] : Number of cache hits. - • TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. - • TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. - • TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. - • TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. - • TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) - • TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests - • TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. + o GRBM_COUNT : Tie High - Count Number of Clocks + o GRBM_GUI_ACTIVE : The GUI is Active + o SQ_WAVES : Count number of waves sent to SQs. 
(per-simd, emulated, global) + o SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) + o SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) + o SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) + o SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) + o SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) + o SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) + o SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) + o SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) + o SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) + o SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) + o SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) + o SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) + o SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) + o SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) + o TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. + o TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. + o TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. + o TCC_HIT[0-15] : Number of cache hits. + o TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. + o TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. + o TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. + o TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. + o TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) + o TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests + o TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. The following derived metrics have been defined and the profiler metrics XML specification can be found at: @@ -882,44 +882,44 @@ Metrics: :: - • TA_BUSY_avr : TA block is busy. Average over TA instances. - • TA_BUSY_max : TA block is busy. Max over TA instances. - • TA_BUSY_min : TA block is busy. Min over TA instances. - • TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. - • TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. - • TCC_HIT_sum : Number of cache hits. Sum over TCC instances. - • TCC_MISS_sum : Number of cache misses. Sum over TCC instances. - • TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. - • TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. - • TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. 
Sum over TCC instances. - • TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. - • TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. - • TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. - • FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - • WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - • GPUBusy : The percentage of time GPU was busy. - • Wavefronts : Total wavefronts. - • VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). - • SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). - • VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. - • SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). - • VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. - • FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. - • LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. - • FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). - • GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). - • VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). - • VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). - • SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). - • Mem32Bwrites : - • FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - • WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - • L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). - • MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). - • MemUnitStalled : The percentage of GPUTime the memory unit is stalled. 
Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). - • WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). - • ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). - • LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). + o TA_BUSY_avr : TA block is busy. Average over TA instances. + o TA_BUSY_max : TA block is busy. Max over TA instances. + o TA_BUSY_min : TA block is busy. Min over TA instances. + o TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. + o TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. + o TCC_HIT_sum : Number of cache hits. Sum over TCC instances. + o TCC_MISS_sum : Number of cache misses. Sum over TCC instances. + o TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. + o TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. + o TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. + o TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. + o TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. + o TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. + o FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + o WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + o GPUBusy : The percentage of time GPU was busy. + o Wavefronts : Total wavefronts. + o VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). + o SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). + o VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. + o SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). + o VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. + o FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. + o LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. + o FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). 
+ o GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). + o VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). + o VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + o SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + o Mem32Bwrites : + o FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + o WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + o L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). + o MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). + o MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). + o WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). + o ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + o LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). ROC Profiler @@ -1034,7 +1034,7 @@ GitHub: `https://github.com/ROCm-Developer-Tools/roctracer `_ in github. Here are some limitations. - Dwarf debugging is turned off for GPUs. -g will turn on host level debugging only. - - Some simd constructs fail to vectorize on both host and GPUs. + - Some simd constructs fail to vectorize on both host and GPUs. ROCmValidationSuite ===================== -The ROCm Validation Suite (RVS) is a system administrator’s and cluster manager's tool for detecting and troubleshooting common problems affecting AMD GPU(s) running in a high-performance computing environment, enabled using the ROCm software stack on a compatible platform. +The ROCm Validation Suite (RVS) is a system administrator's and cluster manager's tool for detecting and troubleshooting common problems affecting AMD GPU(s) running in a high-performance computing environment, enabled using the ROCm software stack on a compatible platform. -The RVS is a collection of tests, benchmarks and qualification tools each targeting a specific sub-system of the ROCm platform. All of the tools are implemented in software and share a common command line interface. Each set of tests are implemented in a “module” which is a library encapsulating the functionality specific to the tool. The CLI can specify the directory containing modules to use when searching for libraries to load. 
Each module may have a set of options that it defines and a configuration file that supports its execution. +The RVS is a collection of tests, benchmarks and qualification tools each targeting a specific sub-system of the ROCm platform. All of the tools are implemented in software and share a common command line interface. Each set of tests is implemented in a "module" which is a library encapsulating the functionality specific to the tool. The CLI can specify the directory containing modules to use when searching for libraries to load. Each module may have a set of options that it defines and a configuration file that supports its execution. ROCmValidationSuite Modules ****************************** -**GPU Properties – GPUP** +**GPU Properties - GPUP** -The GPU Properties module queries the configuration of a target device and returns the device’s static characteristics. These static values can be used to debug issues such as device support, performance and firmware problems. +The GPU Properties module queries the configuration of a target device and returns the device's static characteristics. These static values can be used to debug issues such as device support, performance and firmware problems. -**GPU Monitor – GM module** +**GPU Monitor - GM module** The GPU monitor tool is capable of running on one, some or all of the GPU(s) installed and will report various information at regular intervals. The module can be configured to halt another RVS module's execution if one of the quantities exceeds a specified boundary value. -**PCI Express State Monitor – PESM module?** +**PCI Express State Monitor - PESM module** -The PCIe State Monitor tool is used to actively monitor the PCIe interconnect between the host platform and the GPU. The module will register a “listener” on a target GPU’s PCIe interconnect, and log a message whenever it detects a state change. The PESM will be able to detect the following state changes: +The PCIe State Monitor tool is used to actively monitor the PCIe interconnect between the host platform and the GPU. The module will register a "listener" on a target GPU's PCIe interconnect, and log a message whenever it detects a state change. The PESM will be able to detect the following state changes: * PCIe link speed changes * GPU power state changes @@ -1754,12 +1754,12 @@ The PCIe State Monitor tool is used to actively monitor the PCIe interconnect be The ROCm Configuration Qualification Tool ensures the platform is capable of running ROCm applications and is configured correctly. It checks the installed versions of the ROCm components and the platform configuration of the system. This includes checking that dependencies, corresponding to the associated operating system and runtime environment, are installed correctly. Other qualification steps include checking: * The existence of the /dev/kfd device - * The /dev/kfd device's permissions + * The /dev/kfd device's permissions * The existence of all required users and groups that support ROCm * That the user mode components are compatible with the drivers, both the KFD and the amdgpu driver. * The configuration of the runtime linker/loader qualifying that all ROCm libraries are in the correct search path. -**PCI Express Qualification Tool – PEQT module** +**PCI Express Qualification Tool - PEQT module** The PCIe Qualification Tool is used to qualify the PCIe bus on which the GPU is connected.
The qualification test will be capable of determining the following characteristics of the PCIe bus interconnect to a GPU: @@ -1768,21 +1768,21 @@ The PCIe Qualification Tool consists is used to qualify the PCIe bus on which th * PCIe link speed * PCIe link width -**SBIOS Mapping Qualification Tool – SMQT module** +**SBIOS Mapping Qualification Tool - SMQT module** -The GPU SBIOS mapping qualification tool is designed to verify that a platform’s SBIOS has satisfied the BAR mapping requirements for VDI and Radeon Instinct products for ROCm support. +The GPU SBIOS mapping qualification tool is designed to verify that a platform's SBIOS has satisfied the BAR mapping requirements for VDI and Radeon Instinct products for ROCm support. -Refer to the “ROCm Use of Advanced PCIe Features and Overview of How BAR Memory is Used In ROCm Enabled System” web page for more information about how BAR memory is initialized by VDI and Radeon products. +Refer to the "ROCm Use of Advanced PCIe Features and Overview of How BAR Memory is Used In ROCm Enabled System" web page for more information about how BAR memory is initialized by VDI and Radeon products. -**P2P Benchmark and Qualification Tool – PBQT module** +**P2P Benchmark and Qualification Tool - PBQT module** The P2P Benchmark and Qualification Tool is designed to provide the list of all GPUs that support P2P and characterize the P2P links between peers. In addition to testing for P2P compatibility, this test will perform a peer-to-peer throughput test between all P2P pairs for performance evaluation. The P2P Benchmark and Qualification Tool will allow users to pick a collection of two or more GPUs on which to run. The user will also be able to select whether or not they want to run the throughput test on each of the pairs. -Please see the web page “ROCm, a New Era in Open GPU Computing” to find out more about the P2P solutions available in a ROCm environment. +Please see the web page "ROCm, a New Era in Open GPU Computing" to find out more about the P2P solutions available in a ROCm environment. -**PCI Express Bandwidth Benchmark – PEBB module** +**PCI Express Bandwidth Benchmark - PEBB module** -The PCIe Bandwidth Benchmark attempts to saturate the PCIe bus with DMA transfers between system memory and a target GPU card’s memory. The maximum bandwidth obtained is reported to help debug low bandwidth issues. The benchmark should be capable of targeting one, some or all of the GPUs installed in a platform, reporting individual benchmark statistics for each. +The PCIe Bandwidth Benchmark attempts to saturate the PCIe bus with DMA transfers between system memory and a target GPU card's memory. The maximum bandwidth obtained is reported to help debug low bandwidth issues. The benchmark should be capable of targeting one, some or all of the GPUs installed in a platform, reporting individual benchmark statistics for each. 
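For reference, the host-to-device transfer rate that PEBB saturates and reports can be approximated directly with a few HIP calls. The following is only a minimal sketch, not RVS code; the buffer size, repeat count and variable names are illustrative, and error checking is omitted for brevity.

::

    // Minimal sketch (not the RVS PEBB implementation): time pinned host-to-device
    // copies with HIP events and report the effective bandwidth in GiB/s.
    // Build with hipcc; error checking is omitted for brevity.
    #include <hip/hip_runtime.h>
    #include <cstdio>

    int main() {
        const size_t copyBytes = 256ull << 20;   // 256 MiB per transfer (illustrative)
        const int repeats = 20;

        void *hostBuf = nullptr, *devBuf = nullptr;
        hipHostMalloc(&hostBuf, copyBytes, hipHostMallocDefault);   // pinned host memory
        hipMalloc(&devBuf, copyBytes);

        hipEvent_t start, stop;
        hipEventCreate(&start);
        hipEventCreate(&stop);

        hipMemcpy(devBuf, hostBuf, copyBytes, hipMemcpyHostToDevice);   // warm-up copy

        hipEventRecord(start, 0);
        for (int i = 0; i < repeats; ++i)
            hipMemcpy(devBuf, hostBuf, copyBytes, hipMemcpyHostToDevice);
        hipEventRecord(stop, 0);
        hipEventSynchronize(stop);

        float ms = 0.0f;
        hipEventElapsedTime(&ms, start, stop);
        const double gib = double(copyBytes) * repeats / double(1ull << 30);
        std::printf("Host->Device: %.2f GiB/s\n", gib / (ms / 1000.0));

        hipEventDestroy(start);
        hipEventDestroy(stop);
        hipFree(devBuf);
        hipHostFree(hostBuf);
        return 0;
    }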
**GPU Stress Test - GST module** @@ -1809,16 +1809,16 @@ CentOS : :: - sudo yum install -y cmake3 doxygen pciutils-devel rpm rpm-build git gcc-c++ + sudo yum install -y cmake3 doxygen pciutils-devel rpm rpm-build git gcc-c++ RHEL : :: - sudo yum install -y cmake3 doxygen rpm rpm-build git gcc-c++ - + sudo yum install -y cmake3 doxygen rpm rpm-build git gcc-c++ + wget http://mirror.centos.org/centos/7/os/x86_64/Packages/pciutils-devel-3.5.1-3.el7.x86_64.rpm - + sudo rpm -ivh pciutils-devel-3.5.1-3.el7.x86_64.rpm SLES : @@ -1826,10 +1826,10 @@ SLES : :: sudo SUSEConnect -p sle-module-desktop-applications/15.1/x86_64 - + sudo SUSEConnect --product sle-module-development-tools/15.1/x86_64 - - sudo zypper install -y cmake doxygen pciutils-devel libpci3 rpm git rpm-build gcc-c++ + + sudo zypper install -y cmake doxygen pciutils-devel libpci3 rpm git rpm-build gcc-c++ Install ROCm stack, rocblas and rocm_smi64 ********************************************* @@ -1866,7 +1866,7 @@ CentOS & RHEL : sudo rpm -e rocm_smi64 && sudo yum install rocm_smi64 SUSE : sudo rpm -e rocm_smi64 && sudo zypper install rocm_smi64 Building from Source -********************** +********************** This section explains how to get and compile current development stream of RVS. @@ -1889,7 +1889,7 @@ If OS is Ubuntu and SLES, use cmake :: cmake ./ -B./build - + make -C ./build @@ -1903,11 +1903,11 @@ If OS is CentOS and RHEL, use cmake3 Build package: - + :: - + cd ./build - + make package Note:_ based on your OS, only DEB or RPM package will be built. You may ignore an error for the unrelated configuration @@ -2223,7 +2223,7 @@ MIVisionX :alt: MIVisionX :target: https://gpuopen-professionalcompute-libraries.github.io/MIVisionX/ -MIVisionX toolkit is a set of comprehensive computer vision and machine intelligence libraries, utilities, and applications bundled into a single toolkit. AMD MIVisionX delivers highly optimized open source implementation of the `Khronos OpenVX™ `_ and OpenVX™ Extensions along with Convolution Neural Net Model Compiler & Optimizer supporting `ONNX `_, and `Khronos NNEF™ `_ exchange formats. The toolkit allows for rapid prototyping and deployment of optimized workloads on a wide range of computer hardware, including small embedded x86 CPUs, APUs, discrete GPUs, and heterogeneous servers. +MIVisionX toolkit is a set of comprehensive computer vision and machine intelligence libraries, utilities, and applications bundled into a single toolkit. AMD MIVisionX delivers highly optimized open source implementation of the `Khronos OpenVX(TM) `_ and OpenVX(TM) Extensions along with Convolution Neural Net Model Compiler & Optimizer supporting `ONNX `_, and `Khronos NNEF(TM) `_ exchange formats. The toolkit allows for rapid prototyping and deployment of optimized workloads on a wide range of computer hardware, including small embedded x86 CPUs, APUs, discrete GPUs, and heterogeneous servers. * `AMD OpenVX `_ * `AMD OpenVX Extensions `_ @@ -2366,7 +2366,7 @@ Using live camera usage: :: - + runvx -frames:live canny-LIVE.gdf **OpenCV_orb-LIVE.gdf** @@ -2432,10 +2432,10 @@ Prerequisites Pre-requisites setup script - MIVisionX-setup.py ************************************************ - + For the convenience of the developer, we here provide the setup script which will install all the dependencies required by this project. -**MIVisionX-setup.py**- This scipts builds all the prerequisites required by MIVisionX. 
The setup script creates a deps folder and installs all the prerequisites, this script only needs to be executed once. If -d option for directory is not given the script will install deps folder in ‘~/’ directory by default, else in the user specified folder. +**MIVisionX-setup.py**- This script builds all the prerequisites required by MIVisionX. The setup script creates a deps folder and installs all the prerequisites; this script only needs to be executed once. If the -d option for directory is not given, the script will install the deps folder in the '~/' directory by default, else in the user-specified folder. **Prerequisites for running the scripts** @@ -2530,7 +2530,7 @@ Build & Install MIVisionX --installer [Package management tool - optional (default:apt-get) [options: Ubuntu:apt-get;CentOS:yum]] --miopen [MIOpen Version - optional (default:2.1.0)] --miopengemm[MIOpenGEMM Version - optional (default:1.1.5)] - --ffmpeg [FFMPEG Installation - optional (default:no) [options:Install ffmpeg - yes]] + --ffmpeg [FFMPEG Installation - optional (default:no) [options:Install ffmpeg - yes]] --rpp [RPP Installation - optional (default:yes) [options:yes/no]] @@ -2558,7 +2558,7 @@ Build & Install MIVisionX * git clone, build and install other ROCm projects (using cmake and % make install) in the below order for vx_nn. * `rocm-cmake `_ * `MIOpenGEMM `_ - * `MIOpen `_ – make sure to use -DMIOPEN_BACKEND=OpenCL option with cmake + * `MIOpen `_ - make sure to use -DMIOPEN_BACKEND=OpenCL option with cmake * install `protobuf `__ * install `OpenCV `__ * install `FFMPEG n4.0.4 `_ - Optional @@ -2583,18 +2583,18 @@ Verify the Installation * Apps, Samples, Documents, Model Compiler and Toolkit are placed into /opt/rocm/mivisionx * Run samples to verify the installation - + * **Canny Edge Detection** - + .. image:: https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/blob/master/samples/images/canny_image.PNG?raw=true :align: center :width: 600 - + :: export PATH=$PATH:/opt/rocm/mivisionx/bin export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/mivisionx/lib - runvx /opt/rocm/mivisionx/samples/gdf/canny.gdf + runvx /opt/rocm/mivisionx/samples/gdf/canny.gdf Note: More samples are available `here `_ @@ -2666,13 +2666,13 @@ MIVisionX provides developers with docker images for Ubuntu 16.04, Ubuntu 18.04, -* Optional: Map localhost directory on the docker image +* Optional: Map localhost directory on the docker image * option to map the localhost directory with trained caffe models to be accessed on the docker image.
* usage: -v {LOCAL_HOST_DIRECTORY_PATH}:{DOCKER_DIRECTORY_PATH} - - + + :: - + sudo docker run -it -v /home/:/root/hostDrive/ --device=/dev/kfd --device=/dev/dri --cap-add=SYS_RAWIO --device=/dev/mem --group-add video --network host mivisionx/ubuntu-16.04 @@ -2680,24 +2680,24 @@ MIVisionX provides developers with docker images for Ubuntu 16.04, Ubuntu 18.04, **Note: Display option with docker** * Using host display - + :: - + xhost +local:root - sudo docker run -it --device=/dev/kfd --device=/dev/dri --cap-add=SYS_RAWIO --device=/dev/mem --group-add video - --network host --env DISPLAY=unix$DISPLAY --privileged --volume $XAUTH:/root/.Xauthority + sudo docker run -it --device=/dev/kfd --device=/dev/dri --cap-add=SYS_RAWIO --device=/dev/mem --group-add video + --network host --env DISPLAY=unix$DISPLAY --privileged --volume $XAUTH:/root/.Xauthority --volume /tmp/.X11-unix/:/tmp/.X11-unix mivisionx/ubuntu-16.04:latest * Test display with MIVisionX sample - + :: export PATH=$PATH:/opt/rocm/mivisionx/bin export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/mivisionx/lib - runvx /opt/rocm/mivisionx/samples/gdf/canny.gdf + runvx /opt/rocm/mivisionx/samples/gdf/canny.gdf Release Notes ************* @@ -2705,7 +2705,7 @@ Release Notes **Known issues** * Package (.deb & .rpm) install requires OpenCV v3.4.0 to execute AMD OpenCV extensions - + **Tested configurations** diff --git a/ROCm_Tools/clBLA.rst b/ROCm_Tools/clBLA.rst index 687c38ed..6e9c487e 100644 --- a/ROCm_Tools/clBLA.rst +++ b/ROCm_Tools/clBLA.rst @@ -7,7 +7,7 @@ clBLAS For Github repository `clBLAS `_ -This repository houses the code for the OpenCL™ BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. +This repository houses the code for the OpenCL(TM) BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.12 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves. 
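Because clBLAS leaves OpenCL state management to the caller, a typical call site creates the platform, context, queue and buffers itself and then passes them to the library. The sketch below is a hedged illustration of a single-precision GEMM call built around the clblasSgemm entry point; the sizes and variable names are made up, most error checking is dropped for brevity, and the exact argument list should be confirmed against the installed clBLAS.h.

::

    // Hedged sketch of a clBLAS SGEMM call; assumes clBLAS.h and an OpenCL ICD
    // are installed. Error checking is mostly omitted for brevity.
    #include <clBLAS.h>
    #include <vector>
    #include <cstdio>

    int main() {
        const size_t M = 4, N = 3, K = 5;
        std::vector<float> A(M * K, 1.0f), B(K * N, 2.0f), C(M * N, 0.0f);

        cl_platform_id platform;  clGetPlatformIDs(1, &platform, nullptr);
        cl_device_id device;      clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
        cl_int err;
        cl_context ctx = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
        cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err);

        clblasSetup();   // initialize the library before any BLAS call

        cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  A.size() * sizeof(float), nullptr, &err);
        cl_mem bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  B.size() * sizeof(float), nullptr, &err);
        cl_mem bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, C.size() * sizeof(float), nullptr, &err);
        clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, A.size() * sizeof(float), A.data(), 0, nullptr, nullptr);
        clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, B.size() * sizeof(float), B.data(), 0, nullptr, nullptr);
        clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, C.size() * sizeof(float), C.data(), 0, nullptr, nullptr);

        // C = 1.0 * A * B + 0.0 * C, row-major storage
        cl_event event;
        clblasSgemm(clblasRowMajor, clblasNoTrans, clblasNoTrans,
                    M, N, K,
                    1.0f, bufA, 0, K,
                    bufB, 0, N,
                    0.0f, bufC, 0, N,
                    1, &queue, 0, nullptr, &event);
        clWaitForEvents(1, &event);

        clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, C.size() * sizeof(float), C.data(), 0, nullptr, nullptr);
        std::printf("C[0] = %f\n", C[0]);   // expect 10.0 for these inputs

        clblasTeardown();
        clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC);
        clReleaseCommandQueue(queue); clReleaseContext(ctx);
        return 0;
    }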
@@ -166,7 +166,7 @@ Build dependencies ******************** **Library for Windows** - * Windows® 7/8 + * Windows(R) 7/8 * Visual Studio 2010 SP1, 2012 * An OpenCL SDK, such as APP SDK 2.8 * Latest CMake diff --git a/ROCm_Tools/clFFT.rst b/ROCm_Tools/clFFT.rst index df127b81..8159855c 100644 --- a/ROCm_Tools/clFFT.rst +++ b/ROCm_Tools/clFFT.rst @@ -164,7 +164,7 @@ Build dependencies To develop the clFFT library code on a Windows operating system, ensure to install the following packages on your system: - * Windows® 7/8.1 + * Windows(R) 7/8.1 * Visual Studio 2012 or later @@ -193,7 +193,7 @@ To test the developed clFFT library code, ensure to install the following packag * Googletest v1.6 * Latest FFTW - + * Latest Boost Performance infrastructure diff --git a/ROCm_Tools/clRNG.rst b/ROCm_Tools/clRNG.rst index aa450aa5..aa2e5781 100644 --- a/ROCm_Tools/clRNG.rst +++ b/ROCm_Tools/clRNG.rst @@ -3,12 +3,12 @@ ========= clRNG ========= - + For Github repository `clRNG `_ A library for uniform random number generation in OpenCL. -Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4×32-10 generators. +Streams of random numbers act as virtual random number generators. They can be created on the host computer in unlimited numbers, and then used either on the host or on computing devices by work items to generate random numbers. Each stream also has equally-spaced substreams, which are occasionally useful. The API is currently implemented for four different RNGs, namely the MRG31k3p, MRG32k3a, LFSR113 and Philox-4x32-10 generators. Documentation *************** @@ -31,7 +31,7 @@ Building ********** 1. Install the runtime dependency: * An OpenCL SDK, such as APP SDK. - + 2. Install the build dependencies: * The CMake cross-platform build system. Visual Studio users can use CMake Tools for Visual Studio. @@ -64,7 +64,7 @@ On a 64-bit Linux platform, steps 3 through 9 from above, executed in a Bash-com export CLRNG_ROOT=$PWD/package export LD_LIBRARY_PATH=$CLRNG_ROOT/lib64:$LD_LIBRARY_PATH $CLRNG_ROOT/bin/CTest - + Examples *********** Examples can be found in src/client. The compiled client program examples can be found under the bin subdirectory of the installation package ($CLRNG_ROOT/bin under Linux). Note that the examples expect an OpenCL GPU device to be available. diff --git a/ROCm_Tools/clSPARSE.rst b/ROCm_Tools/clSPARSE.rst index d5414078..0e6a80c7 100644 --- a/ROCm_Tools/clSPARSE.rst +++ b/ROCm_Tools/clSPARSE.rst @@ -5,10 +5,10 @@ =========== clSPARSE =========== - + For Github repository `clSPARSE `_ -an OpenCL™ library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. +an OpenCL(TM) library implementing Sparse linear algebra routines. This project is a result of a collaboration between `AMD Inc. `_ and `Vratis Ltd. `_. 
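Several of the clSPARSE routines listed in the next section operate on the CSR format. As a plain-C++ reference for what a sparse matrix-dense vector multiply (SpM-dV) computes, here is a small sketch of CSR SpMV (y = alpha*A*x + beta*y); it illustrates the math only and does not use the clSPARSE API.

::

    // Plain C++ illustration of CSR SpMV (y = alpha*A*x + beta*y); not clSPARSE code.
    #include <cstdio>
    #include <vector>

    int main() {
        // 3x3 sparse matrix in CSR form:
        // [ 1 0 2 ]
        // [ 0 3 0 ]
        // [ 4 0 5 ]
        std::vector<int>   rowPtr = {0, 2, 3, 5};      // row start offsets into colIdx/vals
        std::vector<int>   colIdx = {0, 2, 1, 0, 2};   // column index of each non-zero
        std::vector<float> vals   = {1, 2, 3, 4, 5};   // non-zero values
        std::vector<float> x = {1, 1, 1}, y = {0, 0, 0};
        const float alpha = 1.0f, beta = 0.0f;

        for (size_t row = 0; row + 1 < rowPtr.size(); ++row) {
            float acc = 0.0f;
            for (int j = rowPtr[row]; j < rowPtr[row + 1]; ++j)
                acc += vals[j] * x[colIdx[j]];
            y[row] = alpha * acc + beta * y[row];
        }

        for (float v : y) std::printf("%g ", v);       // prints: 3 3 9
        std::printf("\n");
        return 0;
    }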
What's new in clSPARSE v0.10.1 ****************************** @@ -30,7 +30,7 @@ clSPARSE features * Dense to CSR conversions (& converse) * COO to CSR conversions (& converse) * Functions to read matrix market files in COO or CSR format -True in spirit with the other clMath libraries, clSPARSE exports a “C” interface to allow projects to build wrappers around clSPARSE in any language they need. A great deal of thought and effort went into designing the API’s to make them less ‘cluttered’ compared to the older clMath libraries. OpenCL state is not explicitly passed through the API, which enables the library to be forward compatible when users are ready to switch from OpenCL 1.2 to OpenCL 2.0 3 +True in spirit with the other clMath libraries, clSPARSE exports a "C" interface to allow projects to build wrappers around clSPARSE in any language they need. A great deal of thought and effort went into designing the API's to make them less 'cluttered' compared to the older clMath libraries. OpenCL state is not explicitly passed through the API, which enables the library to be forward compatible when users are ready to switch from OpenCL 1.2 to OpenCL 2.0 3 Google Groups *************** @@ -67,7 +67,7 @@ clSPARSE is licensed under the `Apache License `_) * Solution (.sln) or diff --git a/ROCm_Tools/hcFFT.rst b/ROCm_Tools/hcFFT.rst index 2f58473e..627720fb 100644 --- a/ROCm_Tools/hcFFT.rst +++ b/ROCm_Tools/hcFFT.rst @@ -31,7 +31,7 @@ file: hcfft_1D_R2C.cpp :: #!c++ - + #include #include #include "hcfft.h" @@ -73,9 +73,9 @@ file: hcfft_1D_R2C.cpp free(input); free(output); hc::am_free(idata); - hc::am_free(odata); + hc::am_free(odata); } - + * Compiling the example code: Assuming the library and compiler installation is followed as in installation. @@ -94,7 +94,7 @@ The following are the steps to use the library ROCM 1.5 Installation *********************** -To Know more about ROCM refer +To Know more about ROCM refer https://github.com/RadeonOpenCompute/ROCm/blob/master/README.md **a. 
Installing Debian ROCM repositories** @@ -130,8 +130,8 @@ and Reboot the system Once Reboot, to verify that the ROCm stack completed successfully you can execute HSA vector_copy sample application: - * cd /opt/rocm/hsa/sample - * make + * cd /opt/rocm/hsa/sample + * make * ./vector_copy Library Installation @@ -178,7 +178,7 @@ The following are the sub-routines that are implemented KeyFeature ############ - + * Support 1D, 2D and 3D Fast Fourier Transforms * Supports R2C, C2R, C2C, D2Z, Z2D and Z2Z Transforms * Support Out-Of-Place data storage @@ -195,7 +195,7 @@ This section lists the known set of hardware and software requirements to build Hardware ********* - * CPU: mainstream brand, Better if with >=4 Cores Intel Haswell based CPU + * CPU: mainstream brand, Better if with >=4 Cores Intel Haswell based CPU * System Memory >= 4GB (Better if >10GB for NN application over multiple GPUs) * Hard Drive > 200GB (Better if SSD or NVMe driver for NN application over multiple GPUs) * Minimum GPU Memory (Global) > 2GB @@ -250,9 +250,9 @@ Driver versions GPU Cards *********** - * Radeon R9 Nano + * Radeon R9 Nano * Radeon R9 FuryX - * Radeon R9 Fury + * Radeon R9 Fury * Kaveri and Carizo APU Server System diff --git a/ROCm_Tools/hcRNG.rst b/ROCm_Tools/hcRNG.rst index ed6bd664..92be28c3 100644 --- a/ROCm_Tools/hcRNG.rst +++ b/ROCm_Tools/hcRNG.rst @@ -21,7 +21,7 @@ file: Randomarray.cpp :: - + //This example is a simple random array generation and it compares host output with device output //Random number generator Mrg31k3p #include @@ -34,7 +34,7 @@ file: Randomarray.cpp #include #include using namespace hc; - + int main() { hcrngStatus status = HCRNG_SUCCESS; @@ -44,7 +44,7 @@ file: Randomarray.cpp size_t streamCount = 10; //Number of random numbers to be generated //numberCount must be a multiple of streamCount - size_t numberCount = 100; + size_t numberCount = 100; //Enumerate the list of accelerators std::vectoracc = hc::accelerator::get_all(); accelerator_view accl_view = (acc[1].create_view()); @@ -52,21 +52,21 @@ file: Randomarray.cpp float *Random1 = (float*) malloc(sizeof(float) * numberCount); float *Random2 = (float*) malloc(sizeof(float) * numberCount); float *outBufferDevice = hc::am_alloc(sizeof(float) * numberCount, acc[1], 0); - + //Create streams hcrngMrg31k3pStream *streams = hcrngMrg31k3pCreateStreams(NULL, streamCount, &streamBufferSize, NULL); hcrngMrg31k3pStream *streams_buffer = hc::am_alloc(sizeof(hcrngMrg31k3pStream) * streamCount, acc[1], 0); accl_view.copy(streams, streams_buffer, streamCount* sizeof(hcrngMrg31k3pStream)); - - //Invoke random number generators in device (here strean_length and streams_per_thread arguments are default) + + //Invoke random number generators in device (here strean_length and streams_per_thread arguments are default) status = hcrngMrg31k3pDeviceRandomU01Array_single(accl_view, streamCount, streams_buffer, numberCount, outBufferDevice); - + if(status) std::cout << "TEST FAILED" << std::endl; accl_view.copy(outBufferDevice, Random1, numberCount * sizeof(float)); - + //Invoke random number generators in host for (size_t i = 0; i < numberCount; i++) - Random2[i] = hcrngMrg31k3pRandomU01(&streams[i % streamCount]); + Random2[i] = hcrngMrg31k3pRandomU01(&streams[i % streamCount]); // Compare host and device outputs for(int i =0; i < numberCount; i++) { if (Random1[i] != Random2[i]) { @@ -78,7 +78,7 @@ file: Randomarray.cpp continue; } if(!ispassed) std::cout << "TEST FAILED" << std::endl; - + //Free host resources free(Random1); free(Random2); @@ 
-86,8 +86,8 @@ file: Randomarray.cpp hc::am_free(outBufferDevice); hc::am_free(streams_buffer); return 0; - } - + } + * Compiling the example code: @@ -132,8 +132,8 @@ and **Reboot the system** Once Reboot, to verify that the ROCm stack completed successfully you can execute HSA vector_copy sample application: :: - cd /opt/rocm/hsa/sample - make + cd /opt/rocm/hsa/sample + make ./vector_copy Library Installation @@ -141,14 +141,14 @@ Library Installation **a. Install using Prebuilt debian** :: - + wget https://github.com/ROCmSoftwarePlatform/hcRNG/blob/master/pre-builds/hcrng-master-184472e-Linux.deb sudo dpkg -i hcrng-master-184472e-Linux.deb **b. Build debian from source** :: - + git clone https://github.com/ROCmSoftwarePlatform/hcRNG.git && cd hcRNG chmod +x build.sh && ./build.sh diff --git a/ROCm_Tools/hipBLAS.rst b/ROCm_Tools/hipBLAS.rst index c99be8ca..69e5edc9 100644 --- a/ROCm_Tools/hipBLAS.rst +++ b/ROCm_Tools/hipBLAS.rst @@ -54,7 +54,7 @@ Batched and strided GEMM API ***************************** hipBLAS GEMM can process matrices in batches with regular strides. There are several permutations of these API's, the following is an example that takes everything -:: +:: hipblasStatus_t hipblasSgemmStridedBatched( hipblasHandle_t handle, diff --git a/ROCm_Tools/hipeigen.rst b/ROCm_Tools/hipeigen.rst index 50ea9c86..e56bc17c 100644 --- a/ROCm_Tools/hipeigen.rst +++ b/ROCm_Tools/hipeigen.rst @@ -26,7 +26,7 @@ AMD is hosting both debian and rpm repositories for the ROCm 2.4 packages. The p Complete installation steps of ROCm can be found `Here `_ -or +or For Debian based systems, like Ubuntu, configure the Debian ROCm repository as follows: @@ -52,7 +52,7 @@ Next, update the apt-get repository list and install/update the rocm package: Then, make the ROCm kernel your default kernel. If using grub2 as your bootloader, you can edit the GRUB_DEFAULT variable in the following file: -:: +:: sudo vi /etc/default/grub sudo update-grub diff --git a/ROCm_Tools/hipinstall.rst b/ROCm_Tools/hipinstall.rst index ea873d27..9f57e0ee 100644 --- a/ROCm_Tools/hipinstall.rst +++ b/ROCm_Tools/hipinstall.rst @@ -18,7 +18,7 @@ AMD-hcc * Default paths and environment variables: - * By default HIP looks for hcc in /opt/rocm/hcc (can be overridden by setting HCC_HOME environment variable) + * By default HIP looks for hcc in /opt/rocm/hcc (can be overridden by setting HCC_HOME environment variable) * By default HIP looks for HSA in /opt/rocm/hsa (can be overridden by setting HSA_PATH environment variable) * By default HIP is installed into /opt/rocm/hip (can be overridden by setting HIP_PATH environment variable). * Optionally, consider adding /opt/rocm/bin to your PATH to make it easier to use the tools. @@ -28,7 +28,7 @@ NVIDIA-nvcc * Configure the additional package server as described `here `_. * Install the "hip_nvcc" package. This will install CUDA SDK and the HIP porting layer. -:: +:: apt-get install hip_nvcc @@ -60,13 +60,13 @@ HIP source code is available and the project can be built from source on the HCC cd HIP mkdir build cd build - cmake .. + cmake .. 
make make install * Default paths: * By default cmake looks for hcc in /opt/rocm/hcc (can be overridden by setting -DHCC_HOME=/path/to/hcc in the cmake step).* - * By default cmake looks for HSA in /opt/rocm/hsa (can be overridden by setting -DHSA_PATH=/path/to/hsa in the cmake step).* + * By default cmake looks for HSA in /opt/rocm/hsa (can be overridden by setting -DHSA_PATH=/path/to/hsa in the cmake step).* * By default cmake installs HIP to /opt/rocm/hip (can be overridden by setting -DCMAKE_INSTALL_PREFIX=/where/to/install/hip in the cmake step).* Here's a richer command-line that overrides the default paths: diff --git a/ROCm_Tools/rocFFT.rst b/ROCm_Tools/rocFFT.rst index 7e79d871..ce4ff230 100644 --- a/ROCm_Tools/rocFFT.rst +++ b/ROCm_Tools/rocFFT.rst @@ -82,7 +82,7 @@ The following is a simple example code that shows how to use rocFFT to compute a // Copy result back to host std::vector y(N); hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost); - + // Print results for (size_t i = 0; i < N; i++) { diff --git a/ROCm_Tools/rocFFTAPI.rst b/ROCm_Tools/rocFFTAPI.rst index d0a1928e..22c8548c 100644 --- a/ROCm_Tools/rocFFTAPI.rst +++ b/ROCm_Tools/rocFFTAPI.rst @@ -119,7 +119,7 @@ Documentation is TBD. rocfft_transform_type_complex_forward, rocfft_transform_type_complex_inverse, rocfft_transform_type_real_forward, - rocfft_transform_type_real_inverse, + rocfft_transform_type_real_inverse, } rocfft_transform_type; // Precision @@ -136,14 +136,14 @@ Documentation is TBD. rocfft_element_type_complex_double, rocfft_element_type_single, rocfft_element_type_double, - rocfft_element_type_byte, + rocfft_element_type_byte, } rocfft_element_type; // Result placement typedef enum rocfft_result_placement_e { rocfft_placement_inplace, - rocfft_placement_notinplace, + rocfft_placement_notinplace, } rocfft_result_placement; // Array type @@ -153,7 +153,7 @@ Documentation is TBD. rocfft_array_type_complex_planar, rocfft_array_type_real, rocfft_array_type_hermitian_interleaved, - rocfft_array_type_hermitian_planar, + rocfft_array_type_hermitian_planar, } rocfft_array_type; // Execution mode @@ -178,7 +178,7 @@ To give an idea of how the library API is intended to be used, the following seq status = rocfft_plan_description_create(&description); status = rocfft_plan_description_set_data_layout(&description, ...); - // create plan + // create plan status = rocfft_plan_create(&plan, ..., &description); // create execution_info as needed diff --git a/ROCm_Tools/rocblaswiki.rst b/ROCm_Tools/rocblaswiki.rst index d49e3eb1..b13ea6e9 100644 --- a/ROCm_Tools/rocblaswiki.rst +++ b/ROCm_Tools/rocblaswiki.rst @@ -1,7 +1,7 @@ .. _rocblaswiki: ======================== -rocblas build wiki +rocblas build wiki ======================== Home @@ -12,7 +12,7 @@ Building rocBLAS 1. For instructions to build rocblas library and clients, see Build rocBLAS libraries and verification code. 2. For an example using rocBLAS see Example C code calling rocBLAS function. 3. For instructions on how to run/use the client code, see Build rocBLAS libraries, verification-code, tests and benchmarks. - + Functionality *************** rocBLAS exports the following BLAS-like functions at this time. @@ -36,12 +36,12 @@ Rules for obtaining the rocBLAS API from Legacy BLAS * Where Legacy BLAS functions have return values, the return value is instead added as the last function argument. It is returned by reference on either the host or the device. The rocBLAS functions will check to see it the value is on the device. 
If this is true, it is used, else the value is returned on the host. This applies to the following functions: xDOT, xDOTU, xNRM2, xASUM, IxAMAX, IxAMIN. 7. The return value of all functions is rocblas_status, defined in rocblas_types.h. It is used to check for errors. - + Additional notes ****************** * The rocBLAS library is LP64, so rocblas_int arguments are 32 bit and rocblas_long arguments are 64 bit. - * rocBLAS uses column-major storage for 2D arrays, and 1 based indexing for the functions xMAX and xMIN. This is the same as Legacy BLAS and cuBLAS. If you need row-major and 0 based indexing (used in C language arrays) download the `CBLAS `_ file cblas.tgz. Look at the CBLAS functions that provide a thin interface to Legacy BLAS. They convert from row-major, 0 based, to column-major, 1 based. + * rocBLAS uses column-major storage for 2D arrays, and 1 based indexing for the functions xMAX and xMIN. This is the same as Legacy BLAS and cuBLAS. If you need row-major and 0 based indexing (used in C language arrays) download the `CBLAS `_ file cblas.tgz. Look at the CBLAS functions that provide a thin interface to Legacy BLAS. They convert from row-major, 0 based, to column-major, 1 based. This is done by swapping the order of function arguments. It is not necessary to transpose matrices. * The auxiliary functions rocblas_set_pointer and rocblas_get_pointer are used to set and get the value of the state variable rocblas_pointer_mode. This variable is not used, it is added for compatibility with cuBLAS. rocBLAS will check if your scalar argument passed by reference is on the device. If this is true it will pass by reference on the device, else it passes by reference on the host. @@ -93,7 +93,7 @@ rocblas-test runs Google Tests to test the library rocblas-bench executable to benchmark or test individual functions example-sscal example C code calling rocblas_sscal function ================ =========== - + Common uses of install.sh to build (dependencies + library + client) are in the table below. =================== ============ @@ -159,7 +159,7 @@ Build (library dependencies + client dependencies + library + client) using Indi The unit tests and benchmarking applications in the client introduce the following dependencies: #. `boost `_ -#. `fortran `_ +#. `fortran `_ #. `lapack `_ * lapack itself brings a dependency on a fortran compiler #. `googletest `_ @@ -290,7 +290,7 @@ Example hx[i] = rand() % 10 + 1; //generate a integer number between [1, 10] } - // save a copy in hz + // save a copy in hz hz = hx; hipMemcpy(dx, hx.data(), sizeof(float) * N, hipMemcpyHostToDevice); @@ -370,7 +370,7 @@ Run the executable with the command $(CPP) -c -o $@ $< $(CFLAGS) $(EXE) : $(OBJ) - $(LD) $(OBJ) $(LDFLAGS) -o $@ + $(LD) $(OBJ) $(LDFLAGS) -o $@ clean: rm -f $(EXE) $(OBJ) @@ -646,7 +646,7 @@ Train Tensile for rocBLAS Below are 10 steps that can be used to build Tensile and rocBLAS for the sizes specified in rocblas_sgemm_asm_miopen.yaml :: - + git clone -b develop https://github.com/ROCmSoftwarePlatform/Tensile.git cd Tensile mkdir build diff --git a/ROCm_Tools/rocm-debug.rst b/ROCm_Tools/rocm-debug.rst index 8d734511..1f76a909 100644 --- a/ROCm_Tools/rocm-debug.rst +++ b/ROCm_Tools/rocm-debug.rst @@ -21,10 +21,10 @@ Build Steps ************ 1.Install ROCm using the instruction `here `_ - + 2.Clone the Debug SDK repository -:: +:: git clone https://github.com/RadeonOpenCompute/ROCm-GPUDebugSDK.git 3. 
Build the AMD HSA Debug Agent Library and the Matrix multiplication examples by calling make in the src/HSADebugAgent and the samples/MatrixMultiplication directories respectively @@ -32,16 +32,16 @@ Build Steps :: cd src/HSADebugAgent make - + * Note that matrixMul_kernel.hsail is included for reference only. This sample will load the pre-built hsa binary (matrixMul_kernel.brig) to run the kernel. - - + + :: - + cd samples/MatrixMultiplication - + :: - + make 4. Build the Debug Facilities library by calling make in the src/HwDbgFacilities directory diff --git a/ROCm_Tools/tensile.rst b/ROCm_Tools/tensile.rst index 04f4a75e..a697a77b 100644 --- a/ROCm_Tools/tensile.rst +++ b/ROCm_Tools/tensile.rst @@ -63,7 +63,7 @@ Benchmark Config Example Benchmark config.yaml -:: +:: GlobalParameters: PrintLevel: 1 @@ -259,18 +259,18 @@ Each step of the benchmark can override what problem sizes will be benchmarked. 1.[1968] * Benchmark only size 1968; n = 1. - + 2.[16, 1920] * Benchmark sizes 16 to 1968 using the default step size (=16); n = 123. - + 3.[16, 32, 1968] * Benchmark sizes 16 to 1968 using a step size of 32; n = 61. - + 4.[64, 32, 16, 1968] * Benchmark sizes from 64 to 1968 with a step size of 32. Also, increase the step size by 16 each iteration. * This causes fewer sizes to be benchmarked when the sizes are large, and more benchmarks where the sizes are small; this is typically desired behavior. * n = 16 (64, 96, 144, 208, 288, 384, 496, 624, 768, 928, 1104, 1296, 1504, 1728, 1968). The stride at the beginning is 32, but the stride at the end is 256. - + 5.[0] * The size of this index is just whatever size index 0 is. For a 3-dimensional ProblemType, this allows benchmarking only a 2- dimensional or 1-dimensional slice of problem sizes. @@ -372,11 +372,11 @@ Tensile can be installed via: :: - git clone https://github.com/RadeonOpenCompute/Tensile.git + git clone https://github.com/RadeonOpenCompute/Tensile.git python Tensile/Tensile/Tensile.py config.yaml benchmark_path -.. _KernelParameters: +.. _KernelParameters: Kernel Parameters ################### @@ -411,7 +411,7 @@ The kernel parameters affect many aspects of performance. Changing a parameter m .. image:: img1.png :align: center - + How N-Dimensional Tensor Contractions Are Mapped to Finite-Dimensional GPU Kernels ************************************************************************************ For a traditional GEMM, the 2-dimensional output, C[i,j], is mapped to launching a 2-dimensional grid of work groups, each of which has a 2-dimensional grid of work items; one dimension belongs to i and one dimension belongs to j. The 1-dimensional summation is represented by a single loop within the kernel body. @@ -448,7 +448,7 @@ The device languages Tensile supports for the gpu kernels is * OpenCL 1.2 * HIP * Assembly - * gfx803 + * gfx803 * gfx900 .. _LibraryLogic: diff --git a/ROCm_Tools/tutorial.rst b/ROCm_Tools/tutorial.rst index 15c2e053..cbb0e632 100644 --- a/ROCm_Tools/tutorial.rst +++ b/ROCm_Tools/tutorial.rst @@ -2,7 +2,7 @@ tutorial ========== - + How do I debug my GPU application? 
************************************ You can start your program in rocm-gdb just like you would any application under gdb @@ -190,7 +190,7 @@ Switching the focus to another work-item and printing $s0 allows us to view data :: (ROCm-gdb) rocm thread wg:0,0,0 wi:1,0,0 - [ROCm-gdb]: Switching to work-group (0,0,0) and work-item (1,0,0) + [ROCm-gdb]: Switching to work-group (0,0,0) and work-item (1,0,0) (ROCm-gdb) print rocm:$s0 $3 = 1 @@ -299,7 +299,7 @@ The info rocm work-groups command will show the active work-groups for the activ The info rocm wg 0 command will show the information of work-group 0 for the active dispatch -:: +:: Information for Work-group 0 Index Wave ID {SE,SH,CU,SIMD,Wave} Work-item ID Abs Work-item ID PC Source line @@ -337,11 +337,11 @@ ROCm-gdb helps developers to view information about kernels that have been launc (ROCm-gdb) set rocm trace mytrace.csv (ROCm-gdb) set rocm trace on -You can now execute and debug the application within ROCm-gdb. Anytime during the application’s execution you can view my_trace.csv to see the kernels have been dispatched. A sample trace for an application that dispatches a vector add kernel followed by a matrix multiplication kernel in a loop is shown below. - &__OpenCL_matrixMul_kernel -====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== -index queue_id packet_id kernel_name header setup workgroup_size reserved0 grid_size private_segment_size -====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== +You can now execute and debug the application within ROCm-gdb. Anytime during the application's execution you can view my_trace.csv to see the kernels have been dispatched. A sample trace for an application that dispatches a vector add kernel followed by a matrix multiplication kernel in a loop is shown below. + &__OpenCL_matrixMul_kernel +====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== +index queue_id packet_id kernel_name header setup workgroup_size reserved0 grid_size private_segment_size +====== =========== =========== ============================= ======= ======= ================ =========== ========== ====================== group_segment_size kernel_object kernarg_address reserved2 completion_signal 0 380095252 0 &__Gdt_vectoradd_kernel 5122 1 {64 1 1} 0 {64 1 1} 0 0 140737353981952 0x713000 0 7513216 1 380095252 1 &__OpenCL_matrixMul_kernel 5122 2 {16 16 1} 0 {128 80 1} 0 0 140737353983488 0x6ca000 0 7910848 diff --git a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst b/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst index 282c37ea..0013b593 100644 --- a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst +++ b/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst @@ -9,21 +9,21 @@ PCIe Passthrough on KVM ================ The following KVM-based instructions assume a headless host with an input/output memory management unit (IOMMU) to pass peripheral devices such as a GPU to guest virtual machines. If you know your host supports IOMMU but the below command does not find "svm" or "vxm", you may need to enable IOMMU in your BIOS. 
-:: - - cat /proc/cpuinfo | grep -E “svm|vxm” +:: + + cat /proc/cpuinfo | grep -E "svm|vxm" Ubuntu 16.04 **************************** Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 installed - + **a. Install necessary packages and prepare for pass through device** -1. :: - +1. :: + sudo apt-get install qemu-kvm qemu-system bridge-utils virt-manager ubuntu-vm-builder libvirt-dev - + 2. add following modules into /etc/modules | vfio | vfio_iommu_type1 @@ -31,15 +31,15 @@ Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 instal | kvm | kvm_intel - add intel_iommu=on in /etc/default/grub + add intel_iommu=on in /etc/default/grub | GRUB_CMDLINE_LINUX_DEFAULT="quiet splash intel_iommu=on" - :: - + :: + sudo update-grub 3. Blacklist amdgpu by adding the following line to /etc/modprobe.d/blacklist.conf - :: - + :: + blacklist amdgpu **b. Bind pass through device to vfio-pci** @@ -60,8 +60,8 @@ Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 instal 2. Make it executable by enter the command -:: - +:: + chmod 755 vfio-bind 3. Bind the device to vfio by running the command for the three pass through devices @@ -76,17 +76,17 @@ Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 instal **c. Pass through device to guest VM** -1. Start VMM by running “virt-manager” as root. Follow the on screen instruction to create one virtual machine(VM), make sure CPU copy host CPU configuration, network use bridge mode. +1. Start VMM by running "virt-manager" as root. Follow the on screen instruction to create one virtual machine(VM), make sure CPU copy host CPU configuration, network use bridge mode. 2. Add Hardware --> Select PCI Host device, select the appropriate device to pass through. ex:0000:83:00.0 3. sudo setpci -s 83:00.0 CAP_EXP+28.l=40 4. sudo reboot -After reboot, start virt-manager and then start the VM, inside the VM , lspci -d 1002: should shows the pass throughed device. +After reboot, start virt-manager and then start the VM, inside the VM , lspci -d 1002: should shows the pass throughed device. Fedora 27 or CentOS 7 (1708) **************************** From a fresh install of Fedora 27 or CentOS 7 (1708) - + **a. Install necessary packages and prepare for pass through device** 1. Identity the vendor and device id(s) for the PCIe device(s) you wish to passthrough, e.g., 1002:6861 and 1002:aaf8 for an AMD Radeon Pro WX 9100 and its associated audio device, @@ -111,7 +111,7 @@ From a fresh install of Fedora 27 or CentOS 7 (1708) echo "options vfio-pci ids=1002:6861,1002:aaf8" | sudo tee -a /etc/modprobe.d/vfio.conf echo "options vfio-pci disable_vga=1" | sudo tee -a /etc/modprobe.d/vfio.conf sed 's/quiet/quiet rd.driver.pre=vfio-pci video=efifb:off/' /etc/sysconfig/grub - + 5. Update the kernel boot settings :: @@ -140,7 +140,7 @@ Note: To pass a device within a particular IOMMU group, all devices within that lspci -nns "${d##*/}" done; - + ROCm-Docker =========== @@ -152,7 +152,7 @@ This repository contains a framework for building the software layers defined in * Docker on `Ubuntu `_ systems or `Fedora systems `_ * Highly recommended: `Docker-Compose `_ to simplify container management - + Docker Hub ********** Looking for an easy start with ROCm + Docker? The rocm/rocm-terminal image is hosted on `Docker Hub `_ . After the `ROCm kernel is installed `_ , pull the image from Docker Hub and create a new instance of a container. @@ -161,8 +161,8 @@ Looking for an easy start with ROCm + Docker? 
The rocm/rocm-terminal image is ho sudo docker pull rocm/rocm-terminal sudo docker run -it --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video rocm/rocm-terminal - - + + ROCm-docker set up guide ************************* `Installation instructions `_ and asciicasts demos are available to help users quickly get running with rocm-docker. Visit the set up guide to read more. @@ -265,7 +265,7 @@ The dockerfile that serves as a 'terminal' creates a non-root user called **rocm To increase container security: 1.Eliminate the sudo-nopasswd COPY statement in the dockerfile and replace with - + 2.Your own password with RUN echo 'account:password' | chpasswd The docker.ce release 18.02 has known defects working with rocm-user account insider docker image. Please upgrade docker package to the `18.04 build `_. diff --git a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst~ b/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst~ deleted file mode 100644 index b86d0cad..00000000 --- a/ROCm_Virtualization_Containers/ROCm-Virtualization-&-Containers.rst~ +++ /dev/null @@ -1,262 +0,0 @@ - -.. _ROCm-Virtualization-&-Containers: - -================================= -ROCm Virtualization & Containers -================================= - -PCIe Passthrough on KVM -================ -The following KVM-based instructions assume a headless host with an input/output memory management unit (IOMMU) to pass peripheral devices such as a GPU to guest virtual machines. If you know your host supports IOMMU but the below command does not find "svm" or "vxm", you may need to enable IOMMU in your BIOS. - -:: - - cat /proc/cpuinfo | grep -E “svm|vxm” - -Ubuntu 16.04 -**************************** -Assume we use an intel system that support VT-d , with fresh ubuntu 16.04 installed - -**a. Install necessary packages and prepare for pass through device** - -1. :: - - sudo apt-get install qemu-kvm qemu-system bridge-utils virt-manager ubuntu-vm-builder libvirt-dev - - -2. add following modules into /etc/modules - | vfio - | vfio_iommu_type1 - | vfio_pci - | kvm - | kvm_intel - - add intel_iommu=on in /etc/default/grub - | GRUB_CMDLINE_LINUX_DEFAULT="quiet splash intel_iommu=on" - :: - - sudo update-grub - -3. Blacklist amdgpu by adding the following line to /etc/modprobe.d/blacklist.conf - :: - - blacklist amdgpu -**b. Bind pass through device to vfio-pci** - -1. Create a script file (vfio-bind) under /usr/bin. The script file has the following content: - -:: - - #!/bin/bash - modprobe vfio-pci - for dev in "$@"; do - vendor=$(cat /sys/bus/pci/devices/$dev/vendor) - device=$(cat /sys/bus/pci/devices/$dev/device) - if [ -e /sys/bus/pci/devices/$dev/driver ]; then - echo $dev > /sys/bus/pci/devices/$dev/driver/unbind - fi - echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id - done - -2. Make it executable by enter the command - -:: - - chmod 755 vfio-bind - -3. Bind the device to vfio by running the command for the three pass through devices - -:: - - lspci -n -d 1002: - 83:00.0 0300: 1002:7300 (rev ca) - vfio.bind 0000:83:00.0 - -4. sudo reboot - -**c. Pass through device to guest VM** - -1. Start VMM by running “virt-manager” as root. Follow the on screen instruction to create one virtual machine(VM), make sure CPU copy host CPU configuration, network use bridge mode. -2. Add Hardware --> Select PCI Host device, select the appropriate device to pass through. ex:0000:83:00.0 -3. sudo setpci -s 83:00.0 CAP_EXP+28.l=40 -4. 
sudo reboot - -After reboot, start virt-manager and then start the VM, inside the VM , lspci -d 1002: should shows the pass throughed device. - -Fedora 27 or CentOS 7 (1708) -**************************** -From a fresh install of Fedora 27 or CentOS 7 (1708) - -**a. Install necessary packages and prepare for pass through device** - -1. Identity the vendor and device id(s) for the PCIe device(s) you wish to passthrough, e.g., 1002:6861 and 1002:aaf8 for an AMD Radeon Pro WX 9100 and its associated audio device, - lspci -nnk - -2. Install virtualization packages - sudo dnf install @virtualization - sudo usermod -G libvirt -a $(whoami) - sudo usermod -G kvm -a $(whoami) - -3. Enable IOMMU in the GRUB_CMDLINE_LINUX variable for your target kernel - a. For an AMD CPU - sudo sed 's/quiet/quiet amd_iommu=on iommu=pt/' /etc/sysconfig/grub - b. For an Intel CPU - sudo sed 's/quiet/quiet intel_iommu=on iommu=pt/' /etc/sysconfig/grub - -**b. Bind pass through device to vfio-pci** - -4. Preempt the host claiming the device by loading a stub driver - echo "options vfio-pci ids=1002:6861,1002:aaf8" | sudo tee -a /etc/modprobe.d/vfio.conf - echo "options vfio-pci disable_vga=1" | sudo tee -a /etc/modprobe.d/vfio.conf - sed 's/quiet/quiet rd.driver.pre=vfio-pci video=efifb:off/' /etc/sysconfig/grub - -5. Update the kernel boot settings - sudo grub2-mkconfig -o /etc/grub2-efi.cfg - echo 'add_drivers+="vfio vfio_iommu_type1 vfio_pci"' | sudo tee -a /etc/dracut.conf.d/vfio.conf - sudo dracut -f --kver `uname -r` - -6. Reboot and verify that vfio-pci driver has been loaded - lspci -nnk - -**c. Pass through device to guest VM** - -1. Within virt-manager the device should now appear in the list of available PCI devices - -Note: To pass a device within a particular IOMMU group, all devices within that IOMMU group must also be passed. You may wish to refer to https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF for more details, such as the following script that lists all IOMMU groups and the devices within them. - - #!/bin/bash - shopt -s nullglob - for d in /sys/kernel/iommu_groups/*/devices/*; do - n=${d#*/iommu_groups/*}; n=${n%%/*} - printf 'IOMMU Group %s ' "$n" - lspci -nns "${d##*/}" - done; - - -ROCm-Docker -=========== - - * `ROCm-Docker `_ - -This repository contains a framework for building the software layers defined in the Radeon Open Compute Platform into portable docker images. The following are docker dependencies, which should be installed on the target machine. - - * Docker on `Ubuntu `_ systems or `Fedora systems `_ - * Highly recommended: `Docker-Compose `_ to simplify container management - -Docker Hub -********** -Looking for an easy start with ROCm + Docker? The rocm/rocm-terminal image is hosted on `Docker Hub `_ . After the `ROCm kernel is installed `_ , pull the image from Docker Hub and create a new instance of a container. - -:: - - sudo docker pull rocm/rocm-terminal - sudo docker run -it --rm --device="/dev/kfd" rocm/rocm-terminal - -ROCm-docker set up guide -************************* -`Installation instructions `_ and asciicasts demos are available to help users quickly get running with rocm-docker. Visit the set up guide to read more. - -**F.A.Q** - -When working with the ROCm containers, the following are common and useful docker commands: - - * A new docker container typically does not house apt repository meta-data. 
Before trying to install new software using apt, make sure to run sudo apt update first - * A message like the following typically means your user does not have permissions to execute docker; use sudo or `add your user `_ to the docker group. - * Cannot connect to the Docker daemon. Is the docker daemon running on this host? - * Open another terminal into a running container - * sudo docker exec -it bash -l - * Copy files from host machine into running docker container - * sudo docker cp HOST_PATH :/PATH - * Copy files from running docker container onto host machine - * sudo docker cp :/PATH/TO/FILE HOST_PATH - * If receiving messages about no space left on device when pulling images, check the storage driver in use by the docker engine. If its 'device mapper', that means the image size limits imposed by the 'device mapper' storage driver are a problem - Follow the documentation in the :ref:`quickstart` for a solution to change to the storage driver - -**Saving work in a container** - -Docker containers are typically ephemeral, and are discarded after closing the container with the '--rm' flag to docker run. However, there are times when it is desirable to close a container that has arbitrary work in it, and serialize it back into a docker image. This may be to to create a checkpoint in a long and complicated series of instructions, or it may be desired to share the image with others through a docker registry, such as docker hub. - -:: - - sudo docker ps -a # Find container of interest - sudo docker commit - sudo docker images # Confirm existence of a new image - - -Details -******* -Docker does not virtualize or package the linux kernel inside of an image or container. This is a design decision of docker to provide lightweight and fast containerization. The implication for this on the ROCm compute stack is that in order for the docker framework to function, the ROCm kernel and corresponding modules must be installed on the host machine. Containers share the host kernel, so the ROCm KFD component ROCK-Kernel-Driver1 functions outside of docker. - -**Installing ROCK on the host machine.** - -An `apt-get repository `_ is available to automate the installation of the required kernel and kernel modules. - -Building images -**************** -There are two ways to install rocm components: - - 1.install from the rocm apt/rpm repository (packages.amd.com) - - 2.build the components from source and run install scripts - -The first method produces docker images with the smallest footprint and best building speed. The footprint is smaller because no developer tools need to be installed in the image, an the images build speed is fastest because typically downloading binaries is much faster than downloading source and then invoking a build process. Of course, building components allows much greater flexibility on install location and the ability to step through the source with debug builds. ROCm-docker supports making images either way, and depends on the flags passed to the setup script. - -The setup script included in this repository is provides some flexibility to how docker containers are constructed. Unfortunately, Dockerfiles do not have a preprocessor or template language, so typically build instructions are hardcoded. However, the setup script allows us to write a primitive 'template', and after running it instantiates baked dockerfiles with environment variables substituted in. 
For instance, if you wish to build release images and debug images, first run the setup script to generate release dockerfiles and build the images. Then, run the setup script again and specify debug dockerfiles and build new images. The docker images should generate unique image names and not conflict with each other. - -**setup.sh** - -Currently, the setup.sh scripts checks to make sure that it is running on an Ubuntu system, as it makes a few assumptions about the availability of tools and file locations. If running rocm on a Fedora machine, inspect the source of setup.sh and issue the appropriate commands manually. There are a few parameters to setup.sh of a generic nature that affects all images built after running. If no parameters are given, built images will be based off of Ubuntu 16.04 with rocm components installed from debians downloaded from packages.amd.com. Supported parameters can be queried with ./setup --help. - -============================ ======================== =============================================== -setup.sh parameters parameter [default] description -============================ ======================== =============================================== ---ubuntu xx.yy [16.04] Ubuntu version for to inherit base image ---install-docker-compose helper to install the docker-compose tool -============================ ======================== =============================================== - -The following parameters are specific to building containers that compile rocm components from source. - -============================ ======================== =============================================== -setup.sh parameters parameter [default] description -============================ ======================== =============================================== ---tag string ['master'] string representing a git branch name ---branch string ['master'] alias for tag ---debug build code with debug flags -============================ ======================== =============================================== - -./setup generates finalized Dockerfiles from textual template files ending with the .template suffix. Each sub-directory of this repository corresponds to a docker 'build context' responsible for a software layer in the ROCm stack. After running the script, each directory contains generated dockerfiles for building images from debians and from source. - -Docker compose -***************** - -./setup prepares an environment to be controlled with Docker Compose. While docker-compose is not necessary for proper operation, it is highly recommended. setup.sh does provide a flag to simplify the installation of this tool. Docker-compose coordinates the relationships between the various ROCm software layers, and it remembers flags that should be passed to docker to expose devices and import volumes. - -**Example of using docker-compose** - -docker-compose.yml provides services that build and run containers. YAML is structured data, so it's easy to modify and extend. The setup.sh script generates a .env file that docker-compose reads to satisfy the definitions of the variables in the .yml file. 
- - * docker-compose run --rm rocm -- Run container using rocm packages - * docker-compose run --rm rocm-from-src -- Run container with rocm built from source - -============================ ===================================================== -Docker-compose description -============================ ===================================================== -docker-compose docker compose executable -run sub-command to bring up interactive container ---rm when shutting the container down, delete it -rocm application service defined in docker-compose.yml -============================ ===================================================== - -**rocm-user has root privileges by default** - -The dockerfile that serves as a 'terminal' creates a non-root user called rocm-user. Since this container is meant to serve as a development environment (therefore apt-get is likely needed), the user has been added to the linux sudo group. Since it is somewhat difficult to set and change passwords in a container (often requiring a rebuild), the password prompt has been disabled for the sudo group. While this is convenient for development, since you can sudo apt-get install packages without a password, it does imply lower security in the container. - -To increase container security: - - 1. Eliminate the sudo-nopasswd COPY statement in the dockerfile, and - - 2. Set your own password with RUN echo 'account:password' | chpasswd - -**Footnotes:** - -[1] It can be installed into a container, but it does not do anything because containers do not go through the traditional boot process. We actually do provide a container for ROCK-Kernel-Driver, but it is not used by the rest of the docker images. It does provide isolation and a reproducible environment for kernel development. diff --git a/ROCm_Virtualization_Containers/quickstart.rst b/ROCm_Virtualization_Containers/quickstart.rst index 4fbe4e3e..fdd0f844 100644 --- a/ROCm_Virtualization_Containers/quickstart.rst +++ b/ROCm_Virtualization_Containers/quickstart.rst @@ -16,7 +16,7 @@ It is my recommendation to install the rocm kernel first. Depending on how distr Step 1: Install rocm-kernel **************************** -:: +:: wget -qO - http://packages.amd.com/rocm/apt/debian/rocm.gpg.key | sudo apt-key add - sudo sh -c 'echo deb [arch=amd64] http://packages.amd.com/rocm/apt/debian/ trusty main \ diff --git a/Remote_Device_Programming/Memoryhooks.rst b/Remote_Device_Programming/Memoryhooks.rst index 9986eb39..23c53a93 100644 --- a/Remote_Device_Programming/Memoryhooks.rst +++ b/Remote_Device_Programming/Memoryhooks.rst @@ -45,7 +45,7 @@ We use the following algorithm to install the memory hooks: 7. Sometimes it's enough to have hooks for mmap/... to get those events when they are called from malloc/... as well. So first we do some memory allocations and check if we are able to get all events this way. - 8. If we can't, install legacy malloc hooks (__malloc_hook). + 8. If we can't, install legacy malloc hooks (__malloc_hook). We have our own implementation of a heap manager in libucm - ptmalloc3. After we replace the original heap manager, we keep track of which pointers were allocated by our library, so we know to ignore all others (since they were allocated by the previous heap manager). Also, we can't restore the previous state, so libucm.so is marked as 'nodelete'. 9. If the former didn't work, modify the relocation tables to point to our implementation of malloc (and friends).
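The relocation-table technique in step 9 is a form of symbol interposition. As a rough, self-contained illustration of the general idea (this is not the actual libucm implementation; the file and library names are hypothetical), a preloaded shared object can intercept malloc and forward to the real allocator resolved with dlsym(RTLD_NEXT, ...):

::

    /* hook.c - illustrative malloc interposer, not libucm code.
     * Build and run (hypothetical names):
     *   gcc -shared -fPIC -o libhook.so hook.c -ldl
     *   LD_PRELOAD=./libhook.so ./app
     */
    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <stddef.h>
    #include <stdio.h>

    static void *(*real_malloc)(size_t) = NULL;

    void *malloc(size_t size)
    {
        if (real_malloc == NULL) {
            /* Resolve the next malloc in link order, i.e. the real allocator */
            real_malloc = (void *(*)(size_t))dlsym(RTLD_NEXT, "malloc");
        }

        void *ptr = real_malloc(size);

        /* Report the allocation event. A production hook must guard against
         * re-entrancy, since even writing to stderr may itself allocate. */
        fprintf(stderr, "malloc(%zu) = %p\n", size, ptr);
        return ptr;
    }

A real memory-event tracker, like the one described above, additionally records which pointers it allocated so that later free/munmap events can be attributed correctly.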
diff --git a/Remote_Device_Programming/Performancemeasurement.rst b/Remote_Device_Programming/Performancemeasurement.rst index 2bec57d7..b75d1dd7 100644 --- a/Remote_Device_Programming/Performancemeasurement.rst +++ b/Remote_Device_Programming/Performancemeasurement.rst @@ -13,7 +13,7 @@ Features of the library: * uct_perf_test_run() is the function which runs the test. (currently only UCT API is supported) * No need to do any resource allocation - just pass the testing parameters to the API - * Requires running the function on 2 threads/processes/nodes - by passing RTE callbacks which are used to bootstrap the connections. + * Requires running the function on 2 threads/processes/nodes - by passing RTE callbacks which are used to bootstrap the connections. * Two testing modes - ping-pong and unidirectional stream (TBD bi-directional stream) * Configurabe message size, and data layout (short/bcopy/zcopy) * Supports: warmup cycles, unlimited iterations. @@ -73,7 +73,7 @@ Features of ucx_perftest: Every line of the file is a test to run. The first word is the test name, and the rest are command-line arguments for the test. -h Show this help message. - + Server options: -l Accept clients in an infinite loop @@ -109,4 +109,4 @@ When using mpi as the launcher to run ucx_perftest, please make sure that your u | # iterations | typical | average | overall | average | overall | average | overall | +--------------+---------+---------+---------+----------+----------+-----------+-----------+ 586527 0.845 0.852 0.852 4.47 4.47 586527 586527 - 1000000 0.844 0.848 0.851 4.50 4.48 589339 + 1000000 0.844 0.848 0.851 4.50 4.48 589339 diff --git a/Remote_Device_Programming/PrintUCXinfo.rst b/Remote_Device_Programming/PrintUCXinfo.rst index ea336fbe..2fdd8bb4 100644 --- a/Remote_Device_Programming/PrintUCXinfo.rst +++ b/Remote_Device_Programming/PrintUCXinfo.rst @@ -31,7 +31,7 @@ Sample output: :: - # Transport: rc + # Transport: rc # # mlx5_0:1 # speed: 6502.32 MB/sec @@ -48,8 +48,8 @@ Sample output: # atomic_add: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_swap: 32, 64 bit - # atomic_cswap: 32, 64 bit - # error handling: none + # atomic_cswap: 32, 64 bit + # error handling: none # # mlx4_0:1 # speed: 6502.32 MB/sec @@ -83,7 +83,7 @@ Sample output: # am header: <= 127 # atomic_add: 64 bit # atomic_fadd: 64 bit - # atomic_swap: 64 bit + # atomic_swap: 64 bit # atomic_cswap: 64 bit - # error handling: none + # error handling: none # diff --git a/Remote_Device_Programming/Remote-Device-Programming.rst b/Remote_Device_Programming/Remote-Device-Programming.rst index 655cbd77..e8ad6ab1 100644 --- a/Remote_Device_Programming/Remote-Device-Programming.rst +++ b/Remote_Device_Programming/Remote-Device-Programming.rst @@ -10,12 +10,12 @@ ROCmRDMA **Peer-to-Peer bridge driver for PeerDirect - Deprecated Repo** This is now included as part of the ROCK `Kernel Driver `_ -ROCmRDMA is the solution designed to allow third-party kernel drivers to utilize DMA access to the GPU memory. It allows direct path for data exchange (peer-to-peer) using the standard features of PCI Express. +ROCmRDMA is the solution designed to allow third-party kernel drivers to utilize DMA access to the GPU memory. It allows direct path for data exchange (peer-to-peer) using the standard features of PCI Express. 
Currently ROCmRDMA provides the following benefits: * Direct access to ROCm memory for 3rd party PCIe devices - * Support for PeerDirect(c) interface to offloads the CPU when dealing + * Support for PeerDirect(c) interface to offloads the CPU when dealing with ROCm memory for RDMA network stacks; Restrictions and limitations @@ -31,11 +31,11 @@ ROCmRDMA interface specification The implementation of ROCmRDMA interface could be found in `[amd_rdma.h] `_ file. Data structures -*************** +*************** + +:: + -:: - - /** * Structure describing information needed to P2P access from another device * to specific location of GPU memory @@ -44,17 +44,17 @@ Data structures uint64_t va; /**< Specify user virt. address * which this page table described */ - + uint64_t size; /**< Specify total size of * allocation */ - + struct pid *pid; /**< Specify process pid to which * virtual address belongs */ - + struct sg_table *pages; /**< Specify DMA/Bus addresses */ - + void *priv; /**< Pointer set by AMD kernel * driver */ @@ -66,7 +66,7 @@ Data structures * Structure providing function pointers to support rdma/p2p requirements. * to specific location of GPU memory */ - + struct amd_rdma_interface { int (*get_pages)(uint64_t address, uint64_t length, struct pid *pid, struct amd_p2p_info **amd_p2p_data, @@ -77,13 +77,13 @@ Data structures int (*get_page_size)(uint64_t address, uint64_t length, struct pid *pid, unsigned long *page_size); }; - + The function to query ROCmRDMA interface **************************************** :: - + /** * amdkfd_query_rdma_interface - Return interface (function pointers table) for * rdma interface @@ -93,28 +93,28 @@ The function to query ROCmRDMA interface * \return 0 if operation was successful. */ int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma); - + The function to query ROCmRDMA interface **************************************** :: - + /** * amdkfd_query_rdma_interface - Return interface (function pointers table) for rdma interface * \param interace - OUT: Pointer to interface * \return 0 if operation was successful. */ int amdkfd_query_rdma_interface(const struct amd_rdma_interface **rdma); - + ROCmRDMA interface functions description ***************************************** -:: +:: + - /** * This function makes the pages underlying a range of GPU virtual memory * accessible for DMA operations from another PCIe device @@ -153,7 +153,7 @@ ROCmRDMA interface functions description int put_pages(struct amd_p2p_info **p_p2p_data) :: - + /** * Check if given address belongs to GPU address space. * \param address - Address to check @@ -174,8 +174,8 @@ ROCmRDMA interface functions description :param pid - Process id structure. Could be NULL if current one. :param page_size - On return: Page size :rtype:return 0 if operation was successful - - + + UCX ==== @@ -217,7 +217,7 @@ MPI Example of the command line (for InfiniBand RC + shared memory): :: - + $ mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 -x UCX_TLS=rc,sm ./app @@ -238,7 +238,7 @@ Example of the command line (for InfiniBand RC + shared memory): 2. 
The table of MPI and OpenSHMEM distributions that are tested with the HEAD of UCX master ================ =========== -MPI/OpenSHMEM project +MPI/OpenSHMEM project OpenMPI/OSHMEM 2.1.0 MPICH Latest ================ =========== @@ -257,22 +257,22 @@ IPC API **New datatypes** :: - + hsa_amd_ipc_memory_handle_t - + /** IPC memory handle to by passed from one process to another */ typedef struct hsa_amd_ipc_memory_handle_s { uint64_t handle; } hsa_amd_ipc_memory_handle_t; - + hsa_amd_ipc_signal_handle_t - + /** IPC signal handle to by passed from one process to another */ typedef struct hsa_amd_ipc_signal_handle_s { uint64_t handle; } hsa_amd_ipc_signal_handle_t; - + **Memory sharing API** Allows sharing of HSA allocated memory between different processes. @@ -285,9 +285,9 @@ Allows sharing of HSA allocated memory between different processes. | hsa_amd_ipc_get_memory_handle(void *ptr, hsa_amd_ipc_memory_handle_t *ipc_handle); | where: | IN: ptr - Pointer to memory previously allocated via hsa_amd_memory_pool_allocate() call -| OUT: ipc_handle - Unique IPC handle to be used in IPC. -| Application must pass this handle to another process. -| +| OUT: ipc_handle - Unique IPC handle to be used in IPC. +| Application must pass this handle to another process. +| | hsa_amd_ipc_close_memory_handle | Close IPC memory handle previously received via "hsa_amd_ipc_get_memory_handle()" call . @@ -297,7 +297,7 @@ Allows sharing of HSA allocated memory between different processes. | where: | IN: ipc_handle - IPC Handle to close | -| +| | hsa_amd_ipc_open_memory_handle | Open / import an IPC memory handle exported from another process and return address to be used in the current process. @@ -322,9 +322,9 @@ Allows sharing of HSA allocated memory between different processes. | hsa_amd_ipc_get_signal_handle(hsa_signal_t signal, hsa_amd_ipc_signal_handle_t *ipc_handle); | where: | IN: signal - Signal handle created as the result of hsa_signal_create() call. -| OUT: ipc_handle - Unique IPC handle to be used in IPC. -| Application must pass this handle to another process. -| +| OUT: ipc_handle - Unique IPC handle to be used in IPC. +| Application must pass this handle to another process. +| | hsa_amd_ipc_close_signal_handle | Close IPC signal handle previously received via "hsa_amd_ipc_get_signal_handle()" call . @@ -353,26 +353,26 @@ Client should call hsa_signal_destroy() when access to this resource is not need Allows query information about memory resource based on address. It is partially overlapped with the following requirement Memory info interface so it may be possible to merge those two interfaces. 
:: typedef enum hsa_amd_address_info_s { - + /* Return uint32_t / boolean if address was allocated via HSA stack */ HSA_AMD_ADDRESS_HSA_ALLOCATED = 0x1, - + /** Return agent where such memory was allocated */ HSA_AMD_ADDRESS_AGENT = 0x2, - + /** Return pool from which this address was allocated */ HSA_AMD_ADDRESS_POOL = 0x3, - + /** Return size of allocation */ HSA_AMD_ADDRESS_ALLOC_SIZE = 0x4 - + } hsa_amd_address_info_t; **hsa_status_t HSA_API** | hsa_amd_get_address_info(void *ptr, hsa_amd_address_info_t attribute, void* value); -| where: +| where: | ptr - Address information about which to query | attribute - Attribute to query diff --git a/Remote_Device_Programming/UCP-Design.rst b/Remote_Device_Programming/UCP-Design.rst index 487c8572..5bb7e00f 100644 --- a/Remote_Device_Programming/UCP-Design.rst +++ b/Remote_Device_Programming/UCP-Design.rst @@ -26,7 +26,7 @@ MPI Tag Matching strategies Data specification ******************** - + * Contiguous data (no lkey required) * Non-contiguous data with strides and hierarchy, but without memory key * Pack/unpack callbacks diff --git a/Remote_Device_Programming/UCT-Design.rst b/Remote_Device_Programming/UCT-Design.rst index e156700b..95b332d3 100644 --- a/Remote_Device_Programming/UCT-Design.rst +++ b/Remote_Device_Programming/UCT-Design.rst @@ -12,7 +12,7 @@ The library will contain an abstraction layer called "transport" or "tl". It ena Communication primitives ************************* - * Remote memory access: + * Remote memory access: * put * get * Remote memory atomics: @@ -147,7 +147,7 @@ Data specifications * single-dimension scatter/gather - iovec (can be either local or remote) * iovec element has: pointer, length, stride, count, key / iovec+len * the key should have been obtained from mmap functions. - * transport exposes its max number of entries in the iovec + * transport exposes its max number of entries in the iovec * IB implementation note: tl will post umr-s in correct order as needed, with temporary memory keys. * atomics - pass the arguments directly without local key, since cost of copying the result is negligible. diff --git a/Remote_Device_Programming/logging.rst b/Remote_Device_Programming/logging.rst index 04cf14b7..c21f3d63 100644 --- a/Remote_Device_Programming/logging.rst +++ b/Remote_Device_Programming/logging.rst @@ -9,10 +9,10 @@ UCS has logging infrastructure. logging is controlled by a single level: * fatal - stops the program * error - an error which does not stop the program and can be reported back to user. * warn - a warning which does not return error to the user. - + info * debug - debugging messages, low volume, about initialization/cleanup. - * trace - debugging messages, high volume, during runtime, for “special” events. + * trace - debugging messages, high volume, during runtime, for "special" events. * req - details of every send/receive request and tag matching. * data - headers of every packet being sent/received. * async - async notifications and progress thread. 
diff --git a/Remote_Device_Programming/profiling.rst b/Remote_Device_Programming/profiling.rst index 81e21f2e..0cf36f4c 100644 --- a/Remote_Device_Programming/profiling.rst +++ b/Remote_Device_Programming/profiling.rst @@ -50,8 +50,8 @@ Run an application and collect profile: Read profile output file: :: - - $ ucx_read_profile ucx.prof + + $ ucx_read_profile ucx.prof command : ./app host : my_host diff --git a/Remote_Device_Programming/reference b/Remote_Device_Programming/reference index 4502f7f4..727407a3 100644 --- a/Remote_Device_Programming/reference +++ b/Remote_Device_Programming/reference @@ -3,4 +3,4 @@ This section consists of UCX documentation from the following sites: https://www.openucx.org/introduction https://github.com/openucx/ucx/wiki/High-Level-design https://github.com/openucx/ucx/wiki/Infrastructure-and-Tools -https://github.com/openucx/ucx/wiki/FAQ +https://github.com/openucx/ucx/wiki/FAQ diff --git a/Remote_Device_Programming/sideprogresscompletion.rst b/Remote_Device_Programming/sideprogresscompletion.rst index 98c0a2fc..a94ff9e2 100644 --- a/Remote_Device_Programming/sideprogresscompletion.rst +++ b/Remote_Device_Programming/sideprogresscompletion.rst @@ -12,7 +12,7 @@ On the low level, we can consider 2 types of operations: bcopy (including short) :: - ucs_status_t uct_XXX_bcopy(uct_ep_h ep, ..., uint32_t flags); + ucs_status_t uct_XXX_bcopy(uct_ep_h ep, ..., uint32_t flags); ucs_status_t ucx_XXX_zcopy(uct_ep_h ep, ..., uint32_t flags, uct_req_t *req); typedef struct uct_req { @@ -29,8 +29,8 @@ These functions will behave as follows: Implementation notes: - * The transport might limit the amount of sends to single endpoint without considering other endpoints, to enforce fairness. In that case, if the limit is reached, the send will return UCS_ERR_WOULD_BLOCK. - + * The transport might limit the amount of sends to single endpoint without considering other endpoints, to enforce fairness. In that case, if the limit is reached, the send will return UCS_ERR_WOULD_BLOCK. + Protocol layer - Nonblocking MPI ********************************** diff --git a/Tutorial/GCN-asm-tutorial.rst b/Tutorial/GCN-asm-tutorial.rst index 75ff5d2b..a3307b1c 100644 --- a/Tutorial/GCN-asm-tutorial.rst +++ b/Tutorial/GCN-asm-tutorial.rst @@ -7,16 +7,16 @@ GCN asm Tutorial The Art of AMDGCN Assembly: How to Bend the Machine to Your Will ****************************************************************** -The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a previous blog we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won’t always employ 100% of the GPU’s capabilities. Some reasons are the following: +The ability to write code in assembly is essential to achieving the best performance for a GPU program. In a previous blog we described how to combine several languages in a single program using ROCm and Hsaco. This article explains how to produce Hsaco from assembly code and also takes a closer look at some new features of the GCN architecture. I'd like to thank Ilya Perminov of Luxsoft for co-authoring this blog post. 
Programs written for GPUs should achieve the highest performance possible. Even carefully written ones, however, won't always employ 100% of the GPU's capabilities. Some reasons are the following: * The program may be written in a high level language that does not expose all of the features available on the hardware. - * The compiler is unable to produce optimal ISA code, either because the compiler needs to ‘play it safe’ while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. + * The compiler is unable to produce optimal ISA code, either because the compiler needs to 'play it safe' while adhering to the semantics of a language or because the compiler itself is generating un-optimized code. -Consider a program that uses one of GCN’s new features (source code is available on `GitHub `_). Recent hardware architecture updates—DPP and DS Permute instructions—enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. +Consider a program that uses one of GCN's new features (source code is available on `GitHub `_). Recent hardware architecture updates--DPP and DS Permute instructions--enable efficient data sharing between wavefront lanes. To become more familiar with the instruction set, review the `GCN ISA Reference Guide `_. Note: the assembler is currently experimental; some of syntax we describe may change. DS Permute Instructions ************************** -Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don’t write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says “put my lane data in lane i,” and ds_bpermute_b32 says “read data from lane i.” The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: +Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to move between lanes on the basis of an index from another VGPR. These instructions use LDS hardware to route data between the 64 lanes, but they don't write to LDS memory. The difference between them is what to index: the source-lane ID or the destination-lane ID. In other words, ds_permute_b32 says "put my lane data in lane i," and ds_bpermute_b32 says "read data from lane i." The GCN ISA Reference Guide provides a more formal description. The test kernel is simple: read the initial data and indices from memory into GPRs, do the permutation in the GPRs and write the data back to memory. An analogous OpenCL kernel would have this form: :: @@ -28,7 +28,7 @@ Two new instructions, ds_permute_b32 and ds_bpermute_b32, allow VGPR data to mov Passing Parameters to a Kernel ******************************* -Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. 
The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables—except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: +Formal HSA arguments are passed to a kernel using a special read-only memory segment called kernarg. Before a wavefront starts, the base address of the kernarg segment is written to an SGPR pair. The memory layout of variables in kernarg must employ the same order as the list of kernel formal arguments, starting at offset 0, with no padding between variables--except to honor the requirements of natural alignment and any align qualifier. The example host program must create the kernarg segment and fill it with the buffer base addresses. The HSA host code might look like the following: :: @@ -50,7 +50,7 @@ Formal HSA arguments are passed to a kernel using a special read-only memory seg aql->kernarg_address = args; /* * Write the args directly to the kernargs buffer; - * the code assumes that memory is already allocated for the + * the code assumes that memory is already allocated for the * buffers that in_ptr, index_ptr and out_ptr point to */ args->in = in_ptr; @@ -71,9 +71,9 @@ The host program should also allocate memory for the in, index and out buffers. out = AllocateBuffer(size); // Fill Kernarg memory - Kernarg(in); // Add base pointer to “in” buffer - Kernarg(index); // Append base pointer to “index” buffer - Kernarg(out); // Append base pointer to “out” buffer + Kernarg(in); // Add base pointer to "in" buffer + Kernarg(index); // Append base pointer to "index" buffer + Kernarg(out); // Append base pointer to "out" buffer Initial Wavefront and Register State To launch a kernel in real hardware, the run time needs information about the kernel, such as @@ -91,7 +91,7 @@ Initial Wavefront and Register State To launch a kernel in real hardware, the ru .text .p2align 8 .amdgpu_hsa_kernel hello_world - + hello_world: .amd_kernel_code_t @@ -131,7 +131,7 @@ Currently, a programmer must manually set all non-default values to provide the The GPR Counting ****************** -The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0–v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0–s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront’s SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. 
As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: +The next amd_kernel_code_t fields are obvious: is_ptr64 = 1 says we are in 64-bit mode, and kernarg_segment_byte_size = 24 describes the kernarg segment size. The GPR counting is less straightforward, however. The workitem_vgpr_count holds the number of vector registers that each work item uses, and wavefront_sgpr_count holds the number of scalar registers that a wavefront uses. The code above employs v0-v4, so workitem_vgpr_count = 5. But wavefront_sgpr_count = 8 even though the code only shows s0-s5, since the special registers VCC, FLAT_SCRATCH and XNACK are physically stored as part of the wavefront's SGPRs in the highest-numbered SGPRs. In this example, FLAT_SCRATCH and XNACK are disabled, so VCC has only two additional registers. In current GCN3 hardware, VGPRs are allocated in groups of 4 registers and SGPRs in groups of 16. Previous generations (GCN1 and GCN2) have a VGPR granularity of 4 registers and an SGPR granularity of 8 registers. The fields compute_pgm_rsrc1_*gprs contain a device-specific number for each register-block type to allocate for a wavefront. As we said previously, future updates may enable automatic counting, but for now you can use following formulas for all three GCN GPU generations: :: diff --git a/Tutorial/Optimizing-Dispatches.rst b/Tutorial/Optimizing-Dispatches.rst index 38ba7e97..2ffda4ca 100644 --- a/Tutorial/Optimizing-Dispatches.rst +++ b/Tutorial/Optimizing-Dispatches.rst @@ -7,18 +7,18 @@ Optimizing-Dispatches ROCm with Rapid Harmony : Optimizing HSA Dispatch ###################################################### -We `previously `_ looked at how to launch an OpenCL™ kernel using the HSA runtime. That example showed the basics of using the HSA Runtime. `Here `_ we'll turn up the tempo a bit by optimizing the launch code - moving some expensive operations into the setup code (rather than on each dispatch), removing host-side synchronization, and optimizing the memory fences to the bare minimum required. We'll measure the contributions of the different optimizations and discuss the results.The code is available at the `same GitHub repository `_ as before and the optimizations can be enabled with a series of command-line switches. +We `previously `_ looked at how to launch an OpenCL(TM) kernel using the HSA runtime. That example showed the basics of using the HSA Runtime. `Here `_ we'll turn up the tempo a bit by optimizing the launch code - moving some expensive operations into the setup code (rather than on each dispatch), removing host-side synchronization, and optimizing the memory fences to the bare minimum required. We'll measure the contributions of the different optimizations and discuss the results.The code is available at the `same GitHub repository `_ as before and the optimizations can be enabled with a series of command-line switches. Optimizing ############# Bitonic sort involves running the same kernel several times. For the default array length of 32768, the algorithm launches 120 kernels. The original OpenCL code and the associated port used in the example synchronize with the host after each of the kernel code. To improve performance, we can submit all 120 kernels at one time, and only synchronize with the host after the last one completes. 
To make this change, we will need to restructure the BitonicSort::run call as follows: - * Each kernel still needs to wait for the previous kernel to finish executing. The AQL packet in the HSA system architecture defines a “barrier” bit which provides exactly this synchronization – packets with the barrier bit set will wait for all preceding kernels in the same queue to complete before beginning their own execution. Barrier-bit synchronization only works for commands in the same queue, but will be more efficient than using signals in the cases where it applies. So we’ll set the barrier bit for all the kernels to provide the required synchronization between kernels, and therefore will only need to use a completion_signal for the last kernel in the sequence. (all other kernels set the completion_signal to 0, which saves an atomic decrement operation when the command finishes. ) This optimization is marked with p_optPreallocSignal. + * Each kernel still needs to wait for the previous kernel to finish executing. The AQL packet in the HSA system architecture defines a "barrier" bit which provides exactly this synchronization - packets with the barrier bit set will wait for all preceding kernels in the same queue to complete before beginning their own execution. Barrier-bit synchronization only works for commands in the same queue, but will be more efficient than using signals in the cases where it applies. So we'll set the barrier bit for all the kernels to provide the required synchronization between kernels, and therefore will only need to use a completion_signal for the last kernel in the sequence. (all other kernels set the completion_signal to 0, which saves an atomic decrement operation when the command finishes. ) This optimization is marked with p_optPreallocSignal. - * In HSA, each kernel submission requires a block of “kernarg” memory to hold the kernel arguments. The baseline implementation allocates a single kernarg block and re-uses it for each kernel submission. In the optimized version, we submit all the kernels at the same time, but with different kernel arguments, so we must ensure that each kernel has its own kernarg block. The code actually performs a single kernarg allocation with enough space to cover all of the inflight kernels. Additionally, the code aligns each kernarg block on a 64-byte cache line boundary. This avoids false-sharing cases where the GPU is reading kernargs for one command while the host is writing arguments for another kernel, causing the cache line to ping-pong between CPU and GPU caches. The kernarg optimizations are marked with p_optPreallocKernarg. + * In HSA, each kernel submission requires a block of "kernarg" memory to hold the kernel arguments. The baseline implementation allocates a single kernarg block and re-uses it for each kernel submission. In the optimized version, we submit all the kernels at the same time, but with different kernel arguments, so we must ensure that each kernel has its own kernarg block. The code actually performs a single kernarg allocation with enough space to cover all of the inflight kernels. Additionally, the code aligns each kernarg block on a 64-byte cache line boundary. This avoids false-sharing cases where the GPU is reading kernargs for one command while the host is writing arguments for another kernel, causing the cache line to ping-pong between CPU and GPU caches. The kernarg optimizations are marked with p_optPreallocKernarg. 
* The function bitonicSortGPU_opt contains the optimized loop which submits the batch of 120 kernels to the GPU. This code is marked with o_optAvoidHostSync). - + * Each AQL kernel dispatch packet contains a field that controls the memory fences applied before and after the kernel executes. In the baseline implementation, the fences conservatively specify system visibility for both acquire and release fences. (The subject of fences and what they control is well beyond the scope of this document but it covered extensively in the HSA System Architecture Specification Memory Model. It turns out we can make a more surgical use of these fences in the optimized version: (code marked with p_optFence) * The first kernel needs a system acquire fence to make sure it gets the data from the host->device copy. @@ -75,7 +75,7 @@ The timing numbers shown here includes the time to transfer the array to the GPU +----------------------+----------+--------------------+-----------------------+-----------------+------------+-------------------+ |RunTime/Iteration (us)| 1943 | 1906 | 1869 | 1665 | 1221 | 1137 | +----------------------+----------+--------------------+-----------------------+-----------------+------------+-------------------+ -|Delta/Iteration(us) | | -37 | -37 | -204 | -444 | -84 | +|Delta/Iteration(us) | | -37 | -37 | -204 | -444 | -84 | +----------------------+----------+--------------------+-----------------------+-----------------+------------+-------------------+ @@ -88,7 +88,7 @@ The system-scope fences are fairly expensive - Fiji has a 2MB L2 cache, and it t Finally, using pinned host memory improves the transfer speeds from around 6GB/s to 14GB/s. In this workload, we see a modest performance improvement (84us) since most of the benchmark is spent running the kernels and synchronizing between them. Overall the performance improvement from these optimizations is 1.7X faster than the baseline version. - + Reference ########### -`Wikipedia `_ has a nice description of the Bitonic sort algorithm, including pictures. Eric Bainville wrote a nice explanation `here `_ describing how to optimize Bitonic Sort for the GPU. +`Wikipedia `_ has a nice description of the Bitonic sort algorithm, including pictures. Eric Bainville wrote a nice explanation `here `_ describing how to optimize Bitonic Sort for the GPU. diff --git a/Tutorial/ROCm-MultiGPU.rst b/Tutorial/ROCm-MultiGPU.rst index 3104582f..f891dd4b 100644 --- a/Tutorial/ROCm-MultiGPU.rst +++ b/Tutorial/ROCm-MultiGPU.rst @@ -9,7 +9,7 @@ In-node * ROCr Base driver has P2P API support * `ROCr (HSA) AGENT API with Peer to Peer support `_. * `HCC Language Runtime support of P2P ROCr Agent API `_. -* `HIP Language Runtime support of P2P P2P API’s model after CUDA P2P API’s `_. +* `HIP Language Runtime support of P2P P2P API's model after CUDA P2P API's `_. * OpenCL Language Runtime P2P API Peer-to-Peer API with Autocopy support over Intel QPI bus * API name - clEnqueueBufferCopyP2PAMD * Releasing in OpenCL with ROCm 1.6.2 @@ -26,7 +26,7 @@ Out of Node Standard Frameworks for Out of Node Communication --------------------------------------------------- -* `OpenUCX UCX is a communication library implementing high-performance messaging for MPI/PGAS frameworks - In Development `_ `Source for ROCm `_. +* `OpenUCX UCX is a communication library implementing high-performance messaging for MPI/PGAS frameworks - In Development `_ `Source for ROCm `_. 
* `OpenMPI Open MPI Project is an open source Message Passing Interface https://www.open-mpi.org In Development `_. * `MPICH MPICH is a high-performance and widely portable implementation of the Message Passing Interface (MPI) standard (MPI-1, MPI-2 and MPI-3) `_ `In Development `_. * `OpenSHMEM Partitioned Global Address Space & Communication Library - In Development `_. diff --git a/Tutorial/Tutorial.rst b/Tutorial/Tutorial.rst index 9c1f7510..65c37de6 100644 --- a/Tutorial/Tutorial.rst +++ b/Tutorial/Tutorial.rst @@ -17,8 +17,8 @@ Tutorial * :ref:`rocncloc` ROCm With Harmony: Combining OpenCL Kernels, HCC and HSA in a Single Program. This tutorial demonstrates how to compile OpenCL kernels using the CL offline compiler (CLOC) and integrate them with HCC C++ compiled ROCm applications. - * `The AMD GCN Architecture - A Crash Course, by Layla Mah `_ + * `The AMD GCN Architecture - A Crash Course, by Layla Mah `_ - * `AMD GCN Architecture White paper `_ + * `AMD GCN Architecture White paper `_ - * :ref:`ROCm-MultiGPU` + * :ref:`ROCm-MultiGPU` diff --git a/Tutorial/caffe.rst b/Tutorial/caffe.rst index 211cfe56..9fa281e0 100644 --- a/Tutorial/caffe.rst +++ b/Tutorial/caffe.rst @@ -30,38 +30,38 @@ Installing ROCm Debian packages: :: PKG_REPO="http://repo.radeon.com/rocm/apt/debian/" - + wget -qO - $PKG_REPO/rocm.gpg.key | sudo apt-key add - - + sudo sh -c "echo deb [arch=amd64] $PKG_REPO xenial main > /etc/apt/sources.list.d/rocm.list" - + sudo apt-get update - + sudo apt-get install rocm rocm-utils rocm-opencl rocm-opencl-dev rocm-profiler cxlactivitylogger echo 'export PATH=/opt/rocm/bin:$PATH' >> $HOME/.bashrc - + echo 'export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH' >> $HOME/.bashrc source $HOME/.bashrc - + sudo reboot - + Then, verify the installation. Double-check your kernel (at a minimum, you should see "kfd" in the name):: - + uname -r - + In addition, check that you can run the simple HSA vector_copy sample application:: - + cd /opt/rocm/hsa/sample make ./vector_copy - + Pre-requisites Installation ++++++++++++++++++++++++++++ Install Caffe dependencies:: - + sudo apt-get install \ pkg-config \ protobuf-compiler \ @@ -78,24 +78,24 @@ Install Caffe dependencies:: libopencv-dev \ libfftw3-dev \ libelf-dev - + Install the necessary ROCm compute libraries:: - + sudo apt-get install rocm-libs miopen-hip miopengemm hipCaffe Build Steps +++++++++++++++++++++ Clone hipCaffe:: - - git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git + cd hipCaffe - + You may need to modify the Makefile.config file for your own installation. Then, build it:: - + cp ./Makefile.config.example ./Makefile.config - make + make To improve build time, consider invoking parallel make with the "-j$(nproc)" flag. @@ -103,7 +103,7 @@ Unit Testing ------------- Run the following commands to perform unit testing of different components of Caffe. 
-:: +:: make test ./build/test/test_all.testbin @@ -114,7 +114,7 @@ MNIST training ++++++++++++++++ Steps:: - + ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh @@ -123,7 +123,7 @@ CIFAR-10 training ++++++++++++++++++ Steps:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt @@ -142,7 +142,7 @@ Soumith's Convnet benchmarks Steps: :: - + git clone https://github.com/soumith/convnet-benchmarks.git cd convnet-benchmarks/caffe @@ -183,7 +183,7 @@ Sometimes when training with multiple GPUs, we hit this type of error signature: @ 0x8015c3 caffe::Solver<>::Solve() @ 0x71a277 caffe::P2PSync<>::Run() @ 0x42dcbc train() - + See this `comment `_. diff --git a/Tutorial/hipCaffe .rst b/Tutorial/hipCaffe .rst index 025181a0..28e5efde 100644 --- a/Tutorial/hipCaffe .rst +++ b/Tutorial/hipCaffe .rst @@ -4,18 +4,18 @@ hipCaffe Quickstart Guide ########################### -In this quickstart guide, we’ll walk through the steps for ROCm installation. Then, we’ll run a few training and inference experiments and check their accuracy. +In this quickstart guide, we'll walk through the steps for ROCm installation. Then, we'll run a few training and inference experiments and check their accuracy. Install ROCm ------------- -Here are the main ROCm components we’ll be using:: +Here are the main ROCm components we'll be using:: sudo apt install rocm-dkms sudo apt-get install rocm-libs sudo apt-get install miopen-hip miopengemm - + And some misc packages:: - + sudo apt-get install -y \ g++-multilib \ libunwind-dev \ @@ -28,65 +28,65 @@ And some misc packages:: rpm \ unzip \ bc - + Verify ROCm ------------ Test a simple HIP sample:: - + cp -r /opt/rocm/hip/samples ~/hip-samples && cd ~/hip-samples/0_Intro/square/ - + make - + ./square.out - + Install hipCaffe ---------------- Handle the Caffe dependencies first:: - + sudo apt-get install -y \ pkg-config \ protobuf-compiler \ libprotobuf-dev \ libleveldb-dev \ libsnappy-dev \ - libhdf5-serial-dev \ + libhdf5-serial-dev \ libatlas-base-dev \ libboost-all-dev \ libgflags-dev \ libgoogle-glog-dev \ - liblmdb-dev \ + liblmdb-dev \ python-numpy python-scipy python3-dev python-yaml python-pip \ python-skimage python-opencv python-protobuf \ libopencv-dev \ libfftw3-dev \ libelf-dev - + Note that you might need minor changes to Makefile.config (system dependent):: - + cd ~ - + git clone https://github.com/ROCmSoftwarePlatform/hipCaffe.git - + cd hipCaffe - + cp ./Makefile.config.example ./Makefile.config - + make -j$(nproc) - + Workloads ----------- MNIST training +++++++++++++++ -Details on MNIST training can be found at this `link `_. - +Details on MNIST training can be found at this `link `_. + Here are the basic instructions:: ./data/mnist/get_mnist.sh ./examples/mnist/create_mnist.sh ./examples/mnist/train_lenet.sh - + Expected result: >99% accuracy after 10000 iterations :: @@ -104,7 +104,7 @@ Expected result: >99% accuracy after 10000 iterations I0717 21:06:58.701591 9965 solver.cpp:404] Test net output #0: accuracy = 0.9917 I0717 21:06:58.701642 9965 solver.cpp:404] Test net output #1: loss = 0.0269806 (* 1 = 0.0269806 loss) I0717 21:06:58.701668 9965 solver.cpp:322] Optimization Done. - + CIFAR-10 training ++++++++++++++++++ @@ -112,14 +112,14 @@ CIFAR-10 training Details on CIFAR-10 training can be found at this `link `_. 
Here are the basic instructions:: - + ./data/cifar10/get_cifar10.sh ./examples/cifar10/create_cifar10.sh ./build/tools/caffe train --solver=examples/cifar10/cifar10_quick_solver.prototxt - + Expected result: >70% accuracy after 4000 iterations :: - + I0727 18:29:35.248363 33 solver.cpp:279] Solving CIFAR10_quick I0727 18:29:35.248366 33 solver.cpp:280] Learning Rate Policy: fixed I0727 18:29:35.248883 33 solver.cpp:337] Iteration 0, Testing net (#0) @@ -134,7 +134,7 @@ Expected result: >70% accuracy after 4000 iterations I0727 18:30:13.722070 33 solver.cpp:404] Test net output #0: accuracy = 0.7124 I0727 18:30:13.722090 33 solver.cpp:404] Test net output #1: loss = 0.848089 (* 1 = 0.848089 loss) I0727 18:30:13.722095 33 solver.cpp:322] Optimization Done. - + CaffeNet inference +++++++++++++++++++ @@ -142,20 +142,20 @@ CaffeNet inference Details on CaffeNet inference can be found at this `link `_. Here are the basic instructions:: - + ./data/ilsvrc12/get_ilsvrc_aux.sh ./scripts/download_model_binary.py models/bvlc_reference_caffenet ./build/examples/cpp_classification/classification.bin models/bvlc_reference_caffenet/deploy.prototxt models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel data/ilsvrc12/imagenet_mean.binaryproto data/ilsvrc12/synset_words.txt examples/images/cat.jpg - + Expected result: (note the ordering and associated percentages) :: - + ---------- Prediction for examples/images/cat.jpg ---------- 0.3134 - "n02123045 tabby, tabby cat" 0.2380 - "n02123159 tiger cat" 0.1235 - "n02124075 Egyptian cat" 0.1003 - "n02119022 red fox, Vulpes vulpes" 0.0715 - "n02127052 lynx, catamount" - + diff --git a/Tutorial/rocncloc.rst b/Tutorial/rocncloc.rst index 81de8dc4..4ee137ec 100644 --- a/Tutorial/rocncloc.rst +++ b/Tutorial/rocncloc.rst @@ -11,21 +11,21 @@ ROCm With Harmony: Combining OpenCL, HCC, and HSA in a Single Program Introduction ************* -In a previous blog we discussed the different languages available on the ROCm platform. Here we’ll show you how to combine several of these languages in a single program: +In a previous blog we discussed the different languages available on the ROCm platform. Here we'll show you how to combine several of these languages in a single program: - * We’ll use an offline OpenCL™ compiler to compile the “BitonicSort” OpenCL kernel (from the AMD APP SDK) into a standard HSA code object (“hsaco”) format. - * The host code will employ HCC’s hc dialect for device discovery (ie hc::accelerator and hc::accelerator_view) and memory management (hc::array) - * The actual dispatch will use the low-level HSA Runtime calls. Recall that ROCR is an implementation of the HSA Runtime with extensions for multi-GPU configurations. We’ll show you how to extract HSA queue and agent structures from the HCC C++ ones, and then use them to perform the kernel launch. + * We'll use an offline OpenCL(TM) compiler to compile the "BitonicSort" OpenCL kernel (from the AMD APP SDK) into a standard HSA code object ("hsaco") format. + * The host code will employ HCC's hc dialect for device discovery (ie hc::accelerator and hc::accelerator_view) and memory management (hc::array) + * The actual dispatch will use the low-level HSA Runtime calls. Recall that ROCR is an implementation of the HSA Runtime with extensions for multi-GPU configurations. We'll show you how to extract HSA queue and agent structures from the HCC C++ ones, and then use them to perform the kernel launch. There are several reasons you might want to do something along these lines. 
First, many kernels exist in OpenCL and re-using this existing investment can save time. The OpenCL kernel language is widely-used, and it enables programmers to use advanced GPU features including local memory, rich math functions, and vector operations. But the OpenCL runtime can be verbose and the memory interface can be difficult to control and optimize. HCC provides the advantage of a full C++ runtime but also full control over the memory allocation and copies. Using the techniques we'll show you here, you can employ OpenCL kernels without having to port the host runtime code to OpenCL. This approach offers a significant advantage for larger C++ programs that can use a few optimized OpenCL kernels while sticking with C++ kernels and features for the rest of the program. hsaco : The Common Currency **************************** -Hsaco is informally pronounced “sock-o” (with a slight emphasis on the first letter to reflect the otherwise silent “h”). It's a standard ELF file ;`ELF `_ (“Executable and Linkable Format”) is a container format widely used in Linux to store object code, and the hsaco ELF container organization matches the one generated by the popular LLVM tool chain. Hsaco stores the compiled GCN code in the .text section, it optionally contains debug information, and it defines symbols that allow the host code to find the kernel entrypoints and functions. Like other ELF files, code objects can contain multiple kernels, functions, and data – so when using hsaco you will need to specify both the code object and the desired symbol. Refer to the `detailed description `_ of the hsaco format for more information. Many tools in AMD’s compiler chain generate and use the hsaco format including OpenCL, HCC, HIP, the GCN assembler and the HSAIL Finalizer. Kernel code contained in hsaco can be extracted and then launched onto the GPU.Additionally, the `dissembler tool `_ can disassemble hsaco files so you can see what is going on inside the kernel. In a future blog, we’ll talk about using the same techniques described here to assemble and then launch kernels written in GCN assembly. Essentially, hsaco is the interchange format used to pass code between these different tools, and allows code written in different languages to be used together. +Hsaco is informally pronounced "sock-o" (with a slight emphasis on the first letter to reflect the otherwise silent "h"). It's a standard ELF file ;`ELF `_ ("Executable and Linkable Format") is a container format widely used in Linux to store object code, and the hsaco ELF container organization matches the one generated by the popular LLVM tool chain. Hsaco stores the compiled GCN code in the .text section, it optionally contains debug information, and it defines symbols that allow the host code to find the kernel entrypoints and functions. Like other ELF files, code objects can contain multiple kernels, functions, and data - so when using hsaco you will need to specify both the code object and the desired symbol. Refer to the `detailed description `_ of the hsaco format for more information. Many tools in AMD's compiler chain generate and use the hsaco format including OpenCL, HCC, HIP, the GCN assembler and the HSAIL Finalizer. Kernel code contained in hsaco can be extracted and then launched onto the GPU.Additionally, the `dissembler tool `_ can disassemble hsaco files so you can see what is going on inside the kernel. 
In a future blog, we'll talk about using the same techniques described here to assemble and then launch kernels written in GCN assembly. Essentially, hsaco is the interchange format used to pass code between these different tools, and allows code written in different languages to be used together. Compiling an OpenCL Kernel into hsaco ************************************** -The Makefile shows the usage of the `CLOC `_ (CL Offline Compiler) tool to compile the CL kernel into the hsaco file. Here’s the relevant call to CLOC: /opt/rocm/cloc/bin/cloc.sh BitonicSort_Kernels.cl -o BitonicSort_Kernels.hsaco +The Makefile shows the usage of the `CLOC `_ (CL Offline Compiler) tool to compile the CL kernel into the hsaco file. Here's the relevant call to CLOC: /opt/rocm/cloc/bin/cloc.sh BitonicSort_Kernels.cl -o BitonicSort_Kernels.hsaco Using hsaco: ************ @@ -34,7 +34,7 @@ This example shows two methods for accessing the hsaco data from the host applic * Use a separate file and load it using C++ file I/O code. See the load_hsa_from_file() command. This path is enabled when p_loadKernelFromFile=true. * Serialize the code into a global string and thus directly link the hsaco into the executable. This approach avoids the need to find the hsaco file at runtime. This path is enabled when p_loadKernelFromFile=false. -The “load_hsa_code_object” shows the use of the standard HSA Runtime API calls to load the code object into memory and extract the pointer to the BitonicSort kernel. If we were working with an HSAIL or BRIG kernel we would first call the finalizer which would produce hsaco data, and the use these exact same finalizer APIs to load the hsaco into memory and find the desired symbols. This is a powerful and extremely useful concept that allows applications using the HSA Runtime to support either: +The "load_hsa_code_object" shows the use of the standard HSA Runtime API calls to load the code object into memory and extract the pointer to the BitonicSort kernel. If we were working with an HSAIL or BRIG kernel we would first call the finalizer which would produce hsaco data, and the use these exact same finalizer APIs to load the hsaco into memory and find the desired symbols. This is a powerful and extremely useful concept that allows applications using the HSA Runtime to support either: * An industry standard portable intermediate language (HSAIL/BRIG) that can be finalized to a vendor-specific binary, or * A standard ELF container that stores vendor-specific binary code (hsaco). This flavor supports vendor-specific ISA inside a standard container format, and still benefits from the standard HSA runtime API. Effectively this enables use cases where apps and tools can use the HSA Runtime APIs without using HSAIL, and still retain source code portability. @@ -46,7 +46,7 @@ The picture below shows the different steps in the code loading process, and in Making HCC Sing ****************** -The example uses the hc `C++ dialect `_ to select the default accelerator and queue. To launch the hsaco file we’ve created, we need to make HCC reveal the details of the HSA data structure that live under the covers. Here’s the critical piece of code that shows how to get from the HCC world to the HSA world using “hc::accelerator_view::get_hsa_queue”: +The example uses the hc `C++ dialect `_ to select the default accelerator and queue. To launch the hsaco file we've created, we need to make HCC reveal the details of the HSA data structure that live under the covers. 
Here's the critical piece of code that shows how to get from the HCC world to the HSA world using "hc::accelerator_view::get_hsa_queue": :: @@ -63,7 +63,7 @@ Now that we have an HSA queue we can use the low-level HSA runtime API to enqueu Extracting Data Pointers ************************* -The example under discussion uses hc::array<>to store the array of integers that are sorted. The original OpenCL kernel of course knows nothing of the hc::array<> data-type. Here’s the OpenCL kernel signature: +The example under discussion uses hc::array<>to store the array of integers that are sorted. The original OpenCL kernel of course knows nothing of the hc::array<> data-type. Here's the OpenCL kernel signature: :: @@ -76,11 +76,11 @@ When calling this kernel, the first parameter (theArray) is an 8-byte pointer. _inputAccPtr = _inputArray->;accelerator_pointer(); - + Our application is still responsible for ensuring that the data at this pointer is valid on the accelerator, before calling the kernel. In this case, the application copies from host data (allocated with malloc) to the inputArray. -The code also shows the use of hc’s accelerator memory interface to allocate and copy the data. This is an alternative to using hc::array<>, and can be select by setting p_useHcArray=false in the top of the source code. Here’s the relevant code snippet: +The code also shows the use of hc's accelerator memory interface to allocate and copy the data. This is an alternative to using hc::array<>, and can be select by setting p_useHcArray=false in the top of the source code. Here's the relevant code snippet: :: @@ -92,7 +92,7 @@ The code also shows the use of hc’s accelerator memory interface to allocate a We do not recommended usinge hc::array_view<> with the direct hsaco code launching techniques we are discussing here. hc::array_view<> is designed to automatically synchronize the data before and after parallel_for_each blocks are launched. Direct launching with HSA runtime APIs will not automatically synchronize hc::array_view<>. -Finally, HCC provides accessors that allow easy retrieval of the the HSA “regions” associated with an accelerator. The HSA runtime API uses regions to specify where memory on an agent is located - for example coarse-grain device memory or fine-grain system memory. When enumerating accelerators, HCC scans the supported regions for each underlying HSA agent and provides the following accessors: +Finally, HCC provides accessors that allow easy retrieval of the the HSA "regions" associated with an accelerator. The HSA runtime API uses regions to specify where memory on an agent is located - for example coarse-grain device memory or fine-grain system memory. When enumerating accelerators, HCC scans the supported regions for each underlying HSA agent and provides the following accessors: :: @@ -126,7 +126,7 @@ This example uses get_hsa_kernarg_region() to allocate memory for the kernel arg } ; - + /* @@ -144,7 +144,7 @@ This example uses get_hsa_kernarg_region() to allocate memory for the kernel arg assert(HSA_STATUS_SUCCESS == hsa_status); - + /* @@ -165,4 +165,4 @@ Summary We learned how to use offline compilation to convert an OpenCL kernel into a standard hsaco file and then employed the HSA Runtime API to launch that kernel from an HCC program. Harmony! In the future we'll look at how to optimize the HSA Runtime calls, and also how to use other tools to create hsaco files (such as the AMDGCN assembler). Stay tuned. 
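As a concrete recap of the kernarg handling described earlier in this article, the following minimal sketch allocates an argument block from a kernarg region and fills it before dispatch. It assumes the hsa_region_t has already been obtained from the accelerator's kernarg accessor; the argument struct shown here is a hypothetical layout matching an OpenCL kernel that takes one buffer and three uint parameters:

::

    #include <hsa.h>
    #include <stdint.h>
    #include <string.h>
    #include <assert.h>

    /* Hypothetical kernarg layout: it must mirror the kernel's formal
     * arguments, in order, starting at offset 0. */
    typedef struct {
        void     *theArray;     /* device pointer, e.g. from accelerator_pointer() */
        uint32_t  stage;
        uint32_t  passOfStage;
        uint32_t  direction;
    } sort_args_t;

    void *alloc_and_fill_kernargs(hsa_region_t kernarg_region, void *array_dev_ptr,
                                  uint32_t stage, uint32_t pass, uint32_t dir)
    {
        void *kernargs = NULL;

        /* Allocate the argument block from the kernarg region so the GPU can read it */
        hsa_status_t status = hsa_memory_allocate(kernarg_region,
                                                  sizeof(sort_args_t), &kernargs);
        assert(status == HSA_STATUS_SUCCESS);

        sort_args_t args;
        args.theArray    = array_dev_ptr;
        args.stage       = stage;
        args.passOfStage = pass;
        args.direction   = dir;
        memcpy(kernargs, &args, sizeof(args));

        /* The returned pointer is what goes into the AQL dispatch packet's
         * kernarg_address field. */
        return kernargs;
    }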
Reference: `GitHub Code for this example `_ -https://en.wikipedia.org/wiki/Bitonic_sorter +https://en.wikipedia.org/wiki/Bitonic_sorter diff --git a/_templates/breadcrumbs.html b/_templates/breadcrumbs.html index f7561ce9..7cd66af3 100644 --- a/_templates/breadcrumbs.html +++ b/_templates/breadcrumbs.html @@ -40,7 +40,7 @@ {% if display_github %} {% if check_meta and 'github_url' in meta %} - + `Documentation Feedback `_ {{ _('Edit on GitHub') }} {% else %} diff --git a/amdstyles.css b/amdstyles.css index 816ea836..02334498 100644 --- a/amdstyles.css +++ b/amdstyles.css @@ -5696,7 +5696,7 @@ fieldset[disabled] .navbar-inverse .btn-link:focus { } .breadcrumb > li + li:before { - content: "/ "; + content: "/ "; padding: 0 5px; color: #ccc; } @@ -7638,7 +7638,7 @@ table.spec-table tr.detail-view td { } /** - * @author: Dennis Hernández + * @author: Dennis Hernandez * @webSite: http://djhvscf.github.io/Blog * @version: v2.1.1 */ @@ -13081,7 +13081,7 @@ readers do not read off random characters that represent icons */ @font-face { font-family: "ProJP"; - src: local("ヒラギノ角ゴ Pro"); + src: local("?????? Pro"); font-stretch: condensed; font-size: 10%; } @@ -32096,7 +32096,7 @@ body.toolbar-fixed.toolbar-vertical.toolbar-tray-open.toolbar-fixed .toolbar-tra } .region-navigation #block-header-search-block .search-toggle-container .search-toggle .fa-search-toggle:before, .region-navigation #block-exposedformacquia-searchpage-2 .search-toggle-container .search-toggle .fa-search-toggle:before { - content: ""; + content: "?"; } .region-navigation #block-header-search-block .search-toggle-container .search-toggle .fa-search-toggle.active, .region-navigation #block-exposedformacquia-searchpage-2 .search-toggle-container .search-toggle .fa-search-toggle.active { @@ -38783,7 +38783,7 @@ fieldset[disabled] .navbar-inverse .btn-link:focus { } .breadcrumb > li + li:before { - content: "/ "; + content: "/ "; padding: 0 5px; color: #ccc; } @@ -40725,7 +40725,7 @@ table.spec-table tr.detail-view td { } /** - * @author: Dennis Hernández + * @author: Dennis Hernandez * @webSite: http://djhvscf.github.io/Blog * @version: v2.1.1 */ @@ -42039,7 +42039,7 @@ ul.field--name-field-game-information > li.field__item { } ul.field--name-field-game-information > li.field__item:before { - content: '✓'; + content: '?'; padding-right: 5px; } @@ -43355,7 +43355,7 @@ article.embedded-entity span.image-title { width: 100%; margin-top: 40px; } - + .page-node-type-product-page .product fieldset .fieldset-wrapper ul, .block-amd-support-product-spec .product fieldset .fieldset-wrapper ul, .page-node-type-product-page .product fieldset .fieldset-wrapper .field, .block-amd-support-product-spec .product fieldset .fieldset-wrapper .field { float: none; diff --git a/cleanup_text.sh b/cleanup_text.sh new file mode 100755 index 00000000..3649153d --- /dev/null +++ b/cleanup_text.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Script to clean up text files +# Lee Killough +# lee.killough@amd.com + +set -ex + +export PATH=/usr/bin:/bin + +# Go through the entire repository, excluding files normally excluded by Git +git ls-files -z --exclude-standard | while read -rd '' file; do + + # Operate only on regular files of MIME type text/* + if [[ -f "$file" && "$(file -b --mime-type "$file")" == text/* ]]; then + + # Remove editor backup files ending in ~ + if [[ "$file" = *~ ]]; then + git rm "$file" + continue + fi + + # Remove trailing whitespace at end of lines (also converts CR-LF to LF) + sed -i -e 's/[[:space:]]*$//' "$file" + + # Add missing newline to 
end of file + sed -i -e '$a\' "$file" + + # Convert UTF8 non-ASCII to ASCII + temp=$(mktemp) + iconv -s -f utf-8 -t ascii//TRANSLIT "$file" > "$temp" + chmod --reference="$file" "$temp" + mv -f "$temp" "$file" + + # Add the file to the index if it has changed + git add -u "$file" + fi +done + +cat<