FIX proper dimensions grid(height,depth),block(width)

plattenschieber · Mar 17, 2015 · 324a78f · 324a78f
1 parent e0c1bf4
commit 324a78f
Showing 1 changed file with 21 additions and 21 deletions.
diff --git a/hscgv_uebung_4/lbm.cu b/hscgv_uebung_4/lbm.cu
@@ -38,35 +38,35 @@ __constant__ float d_omega;
 size_t __device__ index(int i, int j, int k) {
 
     if(PeriodicBoundaries) {
-        i = (i+gridDim.x)%gridDim.x;
-        j = (j+blockDim.x)%blockDim.x;
-        k = (k+blockDim.y)%blockDim.y;
+        i = (i+blockDim.x)%blockDim.x;
+        j = (j+gridDim.x)%gridDim.x;
+        k = (k+gridDim.y)%gridDim.y;
     }
-    return i + blockDim.x*(j + blockDim.y*size_t(k));
+    return i + blockDim.x*(j + gridDim.x*size_t(k));
 }
 
 size_t __device__ index(int i, int j, int k, int l) {
     if(PeriodicBoundaries) {
-        i = (i+gridDim.x)%gridDim.x;
-        j = (j+blockDim.x)%blockDim.x;
-        k = (k+blockDim.y)%blockDim.y;
+        i = (i+blockDim.x)%blockDim.x;
+        j = (j+gridDim.x)%gridDim.x;
+        k = (k+gridDim.y)%gridDim.y;
     }
 #ifdef INNER_INDEX_DISTRIBUTION
-    return l + Q*(i + blockDim.x*(j + blockDim.y*size_t(k)));
+    return l + Q*(i + blockDim.x*(j + gridDim.x*size_t(k)));
 #else
-    return i + blockDim.x*(j + blockDim.y*(size_t(k) + gridDim.x*l));
+    return i + blockDim.x*(j + gridDim.x*(size_t(k) + gridDim.y*l));
 #endif
 }
 
 __global__ void collideCudaKernel(float *d_cellsCur, char *d_flags, float3 *d_velocity) {
     // get the current thread position
     int i = threadIdx.x;
-    int j = threadIdx.y;
-    int k = blockIdx.x;
+    int j = blockIdx.x;
+    int k = blockIdx.y;
 
     // in case we have no periodic boundaries, the threads on the edges don't have anything to do
     if (!PeriodicBoundaries) {
-        if (i==0 || i==blockDim.x-1 || j==0 || j==blockDim.y-1 || k==0 || k==gridDim.x-1)
+        if (i==0 || i==blockDim.x-1 || j==0 || j==gridDim.x-1 || k==0 || k==gridDim.y-1)
             return;
     }
 
@@ -109,12 +109,12 @@ __global__ void collideCudaKernel(float *d_cellsCur, char *d_flags, float3 *d_ve
 __global__ void streamCudaKernel(float *d_cellsCur, float *d_cellsLast, char *d_flags) {
     // get the current thread position
     int i = threadIdx.x;
-    int j = threadIdx.y;
-    int k = blockIdx.x;
+    int j = blockIdx.x;
+    int k = blockIdx.y;
 
     // in case we have no periodic boundaries, the threads on the edges don't have anything to do
     if (!PeriodicBoundaries) {
-        if (i==0 || i==blockDim.x-1 || j==0 || j==blockDim.y-1 || k==0 || k==gridDim.x-1)
+        if (i==0 || i==blockDim.x-1 || j==0 || j==gridDim.x-1 || k==0 || k==gridDim.y-1)
             return;
     }
 
@@ -137,8 +137,8 @@ __global__ void streamCudaKernel(float *d_cellsCur, float *d_cellsLast, char *d_
 __global__ void analyzeCudaKernel(float *d_cellsCur, char *d_flags, float *d_density, float3 *d_u, float3 *d_velocity) {
     // get the current thread position
     int i = threadIdx.x;
-    int j = threadIdx.y;
-    int k = blockIdx.x;
+    int j = blockIdx.x;
+    int k = blockIdx.y;
 
     // compute density and velocity in cell
     float density = 0.f;
@@ -182,25 +182,25 @@ void LBMD3Q19::initializeCuda() {
 
 //! collide implementation with CUDA
 void LBMD3Q19::collideCuda() {
-    collideCudaKernel<<<dim3(m_width),dim3(m_height,m_depth)>>>(d_cells[m_current], d_flags, d_velocity);
+    collideCudaKernel<<<dim3(m_height, m_depth),dim3(m_width)>>>(d_cells[m_current], d_flags, d_velocity);
 }
 
 //! streaming with CUDA
 void LBMD3Q19::streamCuda() {
-    streamCudaKernel<<<dim3(m_width),dim3(m_height,m_depth)>>>(d_cells[m_current], d_cells[!m_current], d_flags);
+    streamCudaKernel<<<dim3(m_height, m_depth),dim3(m_width)>>>(d_cells[m_current], d_cells[!m_current], d_flags);
 }
 
 //! compute densities and velocities with CUDA
 void LBMD3Q19::analyzeCuda() {
-    analyzeCudaKernel<<<dim3(m_width),dim3(m_height,m_depth)>>>(d_cells[m_current], d_flags, d_density, d_u, d_velocity);
+    analyzeCudaKernel<<<dim3(m_height, m_depth),dim3(m_width)>>>(d_cells[m_current], d_flags, d_density, d_u, d_velocity);
     // we need to copy back the analyzed data to the host
     gpuErrchk (cudaMemcpy(m_u, d_u, sizeof(float3) * m_width * m_height * m_depth, cudaMemcpyDeviceToHost));
     gpuErrchk (cudaMemcpy(m_density, d_density, sizeof(float) * m_width * m_height * m_depth, cudaMemcpyDeviceToHost));
 }
 
 //! compute minimum and maximum density and velocity with CUDA
 void LBMD3Q19::minMaxCuda() {
-    minMaxCudaKernel<<<dim3(m_width),dim3(m_height,m_depth)>>>();
+    minMaxCudaKernel<<<dim3(m_height, m_depth),dim3(m_width)>>>();
 }
 
 //! very dumb function that copies cells back to host