diff --git a/examples/hip/CopyVsZeroCopy/Makefile b/examples/hip/CopyVsZeroCopy/Makefile new file mode 100644 index 000000000..fa945a42f --- /dev/null +++ b/examples/hip/CopyVsZeroCopy/Makefile @@ -0,0 +1,156 @@ +#----------------------------------------------------------------------- +# +# Makefile: Cuda clang demo Makefile for both amdgcn and nvptx targets. +# amdgcn targets begin with gfx. nvptx targets begin with sm_ +# +# Run "make help" to see how to use this Makefile +# +#----------------------------------------------------------------------- +# MIT License +# Copyright (c) 2017 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, copy, +# modify, merge, publish, distribute, sublicense, and/or sell copies +# of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +TESTNAME =ep +FILETYPE =cpp + +UNAMEP = $(shell uname -m) +AOMP_CPUTARGET = $(UNAMEP)-pc-linux-gnu +ifeq ($(UNAMEP),ppc64le) + AOMP_CPUTARGET = ppc64le-linux-gnu +endif +ifeq ($(AOMP),) +# --- Standard Makefile check for AOMP installation --- +ifeq ("$(wildcard $(AOMP))","") + ifneq ($(AOMP),) + $(warning AOMP not found at $(AOMP)) + endif + AOMP = $(HOME)/rocm/aomp + ifeq ("$(wildcard $(AOMP))","") + $(warning AOMP not found at $(AOMP)) + AOMP = /usr/lib/aomp + ifeq ("$(wildcard $(AOMP))","") + $(warning AOMP not found at $(AOMP)) + $(error Please install AOMP or correctly set env-var AOMP) + endif + endif +endif +# --- End Standard Makefile check for AOMP installation --- +endif +ifeq ($(CUDA),) +CUDA = /usr/local/cuda +endif +ifeq ($(AOMP_GPU),) +INSTALLED_GPU = $(shell $(AOMP)/bin/mygpu -d gfx900) # Default AOMP_GPU is gfx900 which is vega +AOMP_GPU ?= $(INSTALLED_GPU) +endif +ifeq ($(TARGETS),) +TARGETS =--offload-arch=$(AOMP_GPU)$(AOMP_TARGET_FEATURES) +endif + +CC =$(AOMP)/bin/clang++ + +# compiler automatically adds "libdevice// to -L opts +LFLAGS =-L$(AOMP)/lib + +# Add cudart only if we have an Nvidia sm_ target +ifeq (sm_,$(findstring sm_,$(TARGETS))) + LFLAGS +=-L$(CUDA)/targets/$(UNAMEP)-linux/lib -lcudart -Wl,-rpath,$(CUDA)/targets/$(UNAMEP)-linux/lib -std=c++11 + CFLAGS +=-x cuda -I$(CUDA)/include +else + AOMPHIP ?= $(AOMP) + PFILE = $(AOMPHIP)/bin/hipcc + ifeq ("$(wildcard $(PFILE))","") + AOMPHIP = $(AOMP)/.. + PFILE = $(AOMPHIP)/bin/hipcc + ifeq ("$(wildcard $(PFILE))","") + AOMPHIP = $(AOMP)/../.. + endif + endif + + VERS = $(shell $(AOMP)/bin/clang --version | grep -oP '(?<=clang version )[0-9.]+') + ifeq ($(shell expr $(VERS) \>= 12.0), 1) + RPTH = -Wl,-rpath,$(AOMPHIP)/lib + endif + HIPLIBS = -L $(AOMPHIP)/hip -L $(AOMPHIP)/lib $(RPTH) + CFLAGS = -x hip -std=c++11 $(HIPLIBS) -lamdhip64 -mcode-object-version=4 +endif + +# ----- Demo compile and link in one step, no object code saved +$(TESTNAME): $(TESTNAME).$(FILETYPE) + $(CC) $(CFLAGS) $(TARGETS) $(LFLAGS) $^ -o $@ + +run: $(TESTNAME) + ./$(TESTNAME) + +# ---- Demo compile and link in two steps, object saved +$(TESTNAME).o: $(TESTNAME).$(FILETYPE) + $(CC) -c $(CFLAGS) $(TARGETS) $^ + +obin: $(TESTNAME).o + $(CC) $(LFLAGS) $^ -o obin + +run_obin: obin + ./obin + +# ---- Demo compile to intermediates LLVMIR or assembly +$(TESTNAME).ll: $(TESTNAME).$(FILETYPE) + $(CC) -c -S -emit-llvm $(CFLAGS) $(TARGETS) $^ + +$(TESTNAME).s: $(TESTNAME).$(FILETYPE) + $(CC) -c -S $(CFLAGS) $(TARGETS) $^ + +help: + @echo + @echo "Makefile Help:" + @echo " Source: $(TESTNAME).$(FILETYPE)" + @echo " Compiler: $(CC)" + @echo " Compiler flags: $(CFLAGS)" + @echo + @echo "Avalable Targets:" + @echo " make // build binary $(TESTNAME)" + @echo " make run // run $(TESTNAME)" + @echo " make $(TESTNAME).o // compile, be, & assemble : -c" + @echo " make obin // link step only" + @echo " make run_obin // run obin " + @echo " make $(TESTNAME).s // compile & backend steps : -c -S" + @echo " make $(TESTNAME).ll // compile step only : -c -S -emit-llvm" + @echo " make clean // cleanup files" + @echo " make help // this help" + @echo + @echo "Environment Variables:" + @echo " AOMP default: $(HOME)/rocm/aomp value: $(AOMP)" + @echo " AOMP_GPU default: gfx900 value: $(AOMP_GPU)" + @echo " CUDA default: /usr/local/cuda value: $(CUDA)" + @echo " TARGETS default: --offload-arch=$(AOMP_GPU)" + @echo " value: $(TARGETS)" + @echo + @echo "Link Flags:" + @echo " Link flags: $(LFLAGS)" + @echo + +# Cleanup anything this makefile can create +clean: + @[ -f ./$(TESTNAME) ] && rm ./$(TESTNAME) ; true + @[ -f ./obin ] && rm ./obin ; true + @[ -f ./$(TESTNAME).ll ] && rm *.ll ; true + @[ -f ./$(TESTNAME).o ] && rm $(TESTNAME).o ; true + @[ -f ./$(TESTNAME).s ] && rm *.s ; true diff --git a/examples/hip/CopyVsZeroCopy/README.md b/examples/hip/CopyVsZeroCopy/README.md new file mode 100644 index 000000000..92a791239 --- /dev/null +++ b/examples/hip/CopyVsZeroCopy/README.md @@ -0,0 +1,11 @@ +CopyVsZeroCopy - Demonstrate performance difference on MI300A between Copy and Zero-Copy configurations. +======================================================= +This test is used to monitor performance difference between OpenMP's matching Copy and Zero-Copy configurations when +programmed in HIP. + +To build in Copy configuration, use: +HSA_XNACK=0 make run + +To build in Zero-Copy configuration, use +HSA_XNACK=1 make run + diff --git a/examples/hip/CopyVsZeroCopy/ep.cpp b/examples/hip/CopyVsZeroCopy/ep.cpp new file mode 100644 index 000000000..43cf507a0 --- /dev/null +++ b/examples/hip/CopyVsZeroCopy/ep.cpp @@ -0,0 +1,107 @@ +// MIT License +// +// Copyright (c) 2017 Advanced Micro Devices, Inc. All Rights Reserved. +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies +// of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// This program replicates OpenMP behavior for two (extremely reduced) +// kernels of the benchmark SPECaccel 2023 452.ep, emulating OpenMP's +// copy and zero-copy runtime behaviors. + + +#include +#include +#include +#include +#include + +#include "hip/hip_runtime.h" + +__global__ void init_xx(double *xx, int length) { + int i = threadIdx.x + blockIdx.x*blockDim.x; + if (i > length) return; + xx[i] = 1.0; +} + +__global__ void inc_xx(double *xx, int blksize, int nk) { + int k = threadIdx.x + blockIdx.x*blockDim.x; + if (k >= blksize) { + return; + } + for(int i=0; i<2*nk; i++) { + xx[k*2*nk + i] += 1.0; + } + return; +} + +int main() { + int blksize = 15000; + int nk = 65536; + double *xx = (double *)malloc(blksize*2*nk*sizeof(double)); + int m = 40; + int mk = 16; + int mm = m - mk; + int np = (1 << mm); + int numblks = ceil( (double)np / (double) blksize); + hipError_t err; + + printf("numblks = %d\n", numblks); + + char *HSA_XNACK_Env = getenv("HSA_XNACK"); + bool isXnackEnabled = false; + if (HSA_XNACK_Env) { + int HSA_XNACK_Val = atoi(HSA_XNACK_Env); + isXnackEnabled = (HSA_XNACK_Val > 0) ? true : false; + } + + double *d_xx = nullptr; + //#pragma omp target enter data map(alloc:xx[0:blksize*2*nk]) + if (!isXnackEnabled) { // Copy + printf("OpenMP Copy configuration\n"); + err = hipMalloc(&d_xx, blksize*2*nk*sizeof(double)); + if (err != HIP_SUCCESS) { + printf("Cannot allocate device memory\n"); + return 0; + } + //hipMemcpy(d_xx, xx, blksize*2*nk*sizeof(double), hipMemcpyHostToDevice); + } else { + printf("OpenMP Zero-Copy configuration\n"); + d_xx = xx; // zero-copy + } + + for (int blk=0; blk < 10; ++blk) { + printf("blk=%d\n", blk); + // #pragma omp target teams loop collapse(2) + // for(int k=0; k>>(d_xx, blksize*2*nk); + hipDeviceSynchronize(); + // #pragma omp target teams loop + // for (int k = 0; k < blksize; k++) + // for(int i=0; i<2*nk; i++) + // xx[k*2*nk + i] += 1.0; + inc_xx<<<938, 16, 0>>>(d_xx, blksize, nk); + hipDeviceSynchronize(); + } + + return 0; +}