Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue with OMP threading #1188

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions src/parcsr_ls/par_lr_interp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1120,17 +1120,16 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A,
HYPRE_Int strong_f_marker;

/* Loop variables */
/*HYPRE_Int index;*/
HYPRE_Int start_indexing = 0;
HYPRE_Int i, i1, i2, jj, kk, k1, jj1;
HYPRE_BigInt big_k1;

/* Threading variables */
HYPRE_Int my_thread_num, num_threads, start, stop;
HYPRE_Int * max_num_threads = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST);
HYPRE_Int * diag_offset;
HYPRE_Int * fine_to_coarse_offset;
HYPRE_Int * offd_offset;
HYPRE_Int num_threads = hypre_NumOptimalThreads(n_fine);
HYPRE_Int my_thread_num, start, stop;
HYPRE_Int *diag_offset;
HYPRE_Int *fine_to_coarse_offset;
HYPRE_Int *offd_offset;

/* Definitions */
HYPRE_Real zero = 0.0;
Expand Down Expand Up @@ -1209,11 +1208,10 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A,
/*-----------------------------------------------------------------------
* Initialize threading variables
*-----------------------------------------------------------------------*/
max_num_threads[0] = hypre_NumThreads();
diag_offset = hypre_CTAlloc(HYPRE_Int, max_num_threads[0], HYPRE_MEMORY_HOST);
fine_to_coarse_offset = hypre_CTAlloc(HYPRE_Int, max_num_threads[0], HYPRE_MEMORY_HOST);
offd_offset = hypre_CTAlloc(HYPRE_Int, max_num_threads[0], HYPRE_MEMORY_HOST);
for (i = 0; i < max_num_threads[0]; i++)
diag_offset = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
fine_to_coarse_offset = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
offd_offset = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
for (i = 0; i < num_threads; i++)
{
diag_offset[i] = 0;
fine_to_coarse_offset[i] = 0;
Expand All @@ -1224,7 +1222,7 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A,
* Loop over fine grid.
*-----------------------------------------------------------------------*/
#ifdef HYPRE_USING_OPENMP
#pragma omp parallel private(i,my_thread_num,num_threads,start,stop,coarse_counter,jj_counter,jj_counter_offd, P_marker, P_marker_offd,jj,kk,i1,k1,loc_col,jj_begin_row,jj_begin_row_offd,jj_end_row,jj_end_row_offd,diagonal,sum,sgn,jj1,i2,distribute,strong_f_marker, big_k1)
#pragma omp parallel num_threads(num_threads) private(i,my_thread_num,start,stop,coarse_counter,jj_counter,jj_counter_offd, P_marker, P_marker_offd,jj,kk,i1,k1,loc_col,jj_begin_row,jj_begin_row_offd,jj_end_row,jj_end_row_offd,diagonal,sum,sgn,jj1,i2,distribute,strong_f_marker, big_k1)
#endif
{

Expand Down Expand Up @@ -1253,20 +1251,19 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A,
jj_counter_offd = start_indexing;
if (n_fine)
{
P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
for (i = 0; i < n_fine; i++)
{ P_marker[i] = -1; }
}
if (full_off_procNodes)
{
P_marker_offd = hypre_CTAlloc(HYPRE_Int, full_off_procNodes, HYPRE_MEMORY_HOST);
P_marker_offd = hypre_CTAlloc(HYPRE_Int, full_off_procNodes, HYPRE_MEMORY_HOST);
for (i = 0; i < full_off_procNodes; i++)
{ P_marker_offd[i] = -1;}
}

/* this thread's row range */
my_thread_num = hypre_GetThreadNum();
num_threads = hypre_NumActiveThreads();
start = (n_fine / num_threads) * my_thread_num;
if (my_thread_num == num_threads - 1)
{ stop = n_fine; }
Expand Down Expand Up @@ -1895,7 +1892,6 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A,
*P_ptr = P;

/* Deallocate memory */
hypre_TFree(max_num_threads, HYPRE_MEMORY_HOST);
hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
hypre_TFree(diag_offset, HYPRE_MEMORY_HOST);
hypre_TFree(offd_offset, HYPRE_MEMORY_HOST);
Expand Down
125 changes: 70 additions & 55 deletions src/seq_mv/csr_matop.c
Original file line number Diff line number Diff line change
Expand Up @@ -333,13 +333,7 @@ hypre_CSRMatrixAddSecondPass( HYPRE_Int firstrow,
}

/*--------------------------------------------------------------------------
* hypre_CSRMatrixAdd:
*
* Adds two CSR Matrices A and B and returns a CSR Matrix C = alpha*A + beta*B;
*
* Note: The routine does not check for 0-elements which might be generated
* through cancellation of elements in A and B or already contained
* in A and B. To remove those, use hypre_CSRMatrixDeleteZeros
* hypre_CSRMatrixAddHost
*--------------------------------------------------------------------------*/

hypre_CSRMatrix*
Expand All @@ -366,6 +360,7 @@ hypre_CSRMatrixAddHost ( HYPRE_Complex alpha,
HYPRE_Int *rownnz_C;
HYPRE_Int nnzrows_C;

HYPRE_Int num_threads;
HYPRE_Int *twspace;

HYPRE_MemoryLocation memory_location_A = hypre_CSRMatrixMemoryLocation(A);
Expand All @@ -387,12 +382,7 @@ hypre_CSRMatrixAddHost ( HYPRE_Complex alpha,
return NULL;
}

/* Allocate memory */
twspace = hypre_TAlloc(HYPRE_Int, hypre_NumThreads(), HYPRE_MEMORY_HOST);
C_i = hypre_CTAlloc(HYPRE_Int, nrows_A + 1, memory_location_C);

/* Set nonzero rows data of diag_C */
nnzrows_C = nrows_A;
if ((nnzrows_A < nrows_A) && (nnzrows_B < nrows_B))
{
hypre_IntArray arr_A;
Expand All @@ -412,17 +402,23 @@ hypre_CSRMatrixAddHost ( HYPRE_Complex alpha,
}
else
{
rownnz_C = NULL;
nnzrows_C = nrows_A;
rownnz_C = NULL;
}

/* Allocate memory */
num_threads = hypre_NumOptimalThreads(nnzrows_C);
twspace = hypre_TAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
C_i = hypre_CTAlloc(HYPRE_Int, nrows_A + 1, memory_location_C);

#ifdef HYPRE_USING_OPENMP
#pragma omp parallel
#pragma omp parallel num_threads(num_threads)
#endif
{
HYPRE_Int ns, ne;
HYPRE_Int *marker = NULL;

hypre_partition1D(nnzrows_C, hypre_NumActiveThreads(), hypre_GetThreadNum(), &ns, &ne);
hypre_partition1D(nnzrows_C, num_threads, hypre_GetThreadNum(), &ns, &ne);

marker = hypre_CTAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_HOST);

Expand All @@ -442,6 +438,16 @@ hypre_CSRMatrixAddHost ( HYPRE_Complex alpha,
return C;
}

/*--------------------------------------------------------------------------
* hypre_CSRMatrixAdd
*
* Adds two CSR Matrices A and B and returns a CSR Matrix C = alpha*A + beta*B;
*
* Note: The routine does not check for 0-elements which might be generated
* through cancellation of elements in A and B or already contained
* in A and B. To remove those, use hypre_CSRMatrixDeleteZeros
*--------------------------------------------------------------------------*/

hypre_CSRMatrix*
hypre_CSRMatrixAdd( HYPRE_Complex alpha,
hypre_CSRMatrix *A,
Expand Down Expand Up @@ -503,6 +509,7 @@ hypre_CSRMatrixBigAdd( hypre_CSRMatrix *A,
HYPRE_Int *C_i;
HYPRE_BigInt *C_j;
HYPRE_Int *twspace;
HYPRE_Int num_threads = hypre_NumOptimalThreads(nrows_A);

HYPRE_MemoryLocation memory_location_A = hypre_CSRMatrixMemoryLocation(A);
HYPRE_MemoryLocation memory_location_B = hypre_CSRMatrixMemoryLocation(B);
Expand All @@ -524,22 +531,21 @@ hypre_CSRMatrixBigAdd( hypre_CSRMatrix *A,
}

/* Allocate memory */
twspace = hypre_TAlloc(HYPRE_Int, hypre_NumThreads(), HYPRE_MEMORY_HOST);
twspace = hypre_TAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
C_i = hypre_CTAlloc(HYPRE_Int, nrows_A + 1, memory_location_C);

#ifdef HYPRE_USING_OPENMP
#pragma omp parallel
#pragma omp parallel num_threads(num_threads)
#endif
{
HYPRE_Int ia, ib, ic, num_nonzeros;
HYPRE_Int ns, ne, pos;
HYPRE_BigInt jcol;
HYPRE_Int ii, num_threads;
HYPRE_Int ii;
HYPRE_Int jj;
HYPRE_Int *marker = NULL;

ii = hypre_GetThreadNum();
num_threads = hypre_NumActiveThreads();
hypre_partition1D(nrows_A, num_threads, ii, &ns, &ne);

marker = hypre_CTAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_HOST);
Expand Down Expand Up @@ -694,6 +700,7 @@ hypre_CSRMatrixMultiplyHost( hypre_CSRMatrix *A,
HYPRE_Complex a_entry, b_entry;
HYPRE_Int allsquare = 0;
HYPRE_Int *twspace;
HYPRE_Int num_threads = hypre_NumOptimalThreads(nnzrows_A);

/* RL: TODO cannot guarantee, maybe should never assert
hypre_assert(memory_location_A == memory_location_B);
Expand Down Expand Up @@ -726,20 +733,18 @@ hypre_CSRMatrixMultiplyHost( hypre_CSRMatrix *A,
}

/* Allocate memory */
twspace = hypre_TAlloc(HYPRE_Int, hypre_NumThreads(), HYPRE_MEMORY_HOST);
twspace = hypre_TAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
C_i = hypre_CTAlloc(HYPRE_Int, nrows_A + 1, memory_location_C);

#ifdef HYPRE_USING_OPENMP
#pragma omp parallel private(ia, ib, ic, ja, jb, num_nonzeros, counter, a_entry, b_entry)
#pragma omp parallel private(ia, ib, ic, ja, jb, num_nonzeros, counter, a_entry, b_entry) num_threads(num_threads)
#endif
{
HYPRE_Int *B_marker = NULL;
HYPRE_Int ns, ne, ii, jj;
HYPRE_Int num_threads;
HYPRE_Int i1, iic;

ii = hypre_GetThreadNum();
num_threads = hypre_NumActiveThreads();
hypre_partition1D(nnzrows_A, num_threads, ii, &ns, &ne);

B_marker = hypre_CTAlloc(HYPRE_Int, ncols_B, HYPRE_MEMORY_HOST);
Expand Down Expand Up @@ -926,6 +931,16 @@ hypre_CSRMatrixMultiplyHost( hypre_CSRMatrix *A,
return C;
}

/*--------------------------------------------------------------------------
* hypre_CSRMatrixMultiply
*
* Multiplies two CSR Matrices A and B and returns a CSR Matrix C;
*
* Note: The routine does not check for 0-elements which might be generated
* through cancellation of elements in A and B or already contained
* in A and B. To remove those, use hypre_CSRMatrixDeleteZeros
*--------------------------------------------------------------------------*/

hypre_CSRMatrix*
hypre_CSRMatrixMultiply( hypre_CSRMatrix *A,
hypre_CSRMatrix *B)
Expand Down Expand Up @@ -1061,6 +1076,8 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A,
HYPRE_Int num_cols_AT;
HYPRE_Int num_nnzs_AT;

HYPRE_Int *bucket;
HYPRE_Int num_threads;
HYPRE_Int max_col;
HYPRE_Int i, j;

Expand Down Expand Up @@ -1119,14 +1136,14 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A,
/*-----------------------------------------------------------------
* Parallel count sort
*-----------------------------------------------------------------*/
HYPRE_Int *bucket = hypre_CTAlloc(HYPRE_Int, (num_cols_A + 1) * hypre_NumThreads(),
HYPRE_MEMORY_HOST);
num_threads = hypre_NumOptimalThreads(nnzrows_A);
bucket = hypre_CTAlloc(HYPRE_Int, (num_cols_A + 1) * num_threads, HYPRE_MEMORY_HOST);

#ifdef HYPRE_USING_OPENMP
#pragma omp parallel
#pragma omp parallel num_threads(num_threads)
#endif
{
HYPRE_Int ii, num_threads, ns, ne;
HYPRE_Int ii, ns, ne;
HYPRE_Int i, j, j0, j1, ir;
HYPRE_Int idx, offset;
HYPRE_Int transpose_i;
Expand All @@ -1136,7 +1153,6 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A,
HYPRE_Int transpose_j1;

ii = hypre_GetThreadNum();
num_threads = hypre_NumActiveThreads();
hypre_partition1D(nnzrows_A, num_threads, ii, &ns, &ne);

/*-----------------------------------------------------------------
Expand Down Expand Up @@ -1252,8 +1268,8 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A,
} /* end parallel region */

hypre_CSRMatrixI(*AT) = hypre_TAlloc(HYPRE_Int, num_cols_A + 1, memory_location);
hypre_TMemcpy(hypre_CSRMatrixI(*AT), bucket, HYPRE_Int, num_cols_A + 1, memory_location,
HYPRE_MEMORY_HOST);
hypre_TMemcpy(hypre_CSRMatrixI(*AT), bucket, HYPRE_Int, num_cols_A + 1,
memory_location, HYPRE_MEMORY_HOST);
hypre_CSRMatrixI(*AT)[num_cols_A] = num_nnzs_A;
hypre_TFree(bucket, HYPRE_MEMORY_HOST);

Expand Down Expand Up @@ -1313,47 +1329,46 @@ hypre_CSRMatrixSplit(hypre_CSRMatrix *Bs_ext,
hypre_CSRMatrix **Bext_diag_ptr,
hypre_CSRMatrix **Bext_offd_ptr)
{
HYPRE_Complex *Bs_ext_data = hypre_CSRMatrixData(Bs_ext);
HYPRE_Int *Bs_ext_i = hypre_CSRMatrixI(Bs_ext);
HYPRE_BigInt *Bs_ext_j = hypre_CSRMatrixBigJ(Bs_ext);
HYPRE_Int num_rows_Bext = hypre_CSRMatrixNumRows(Bs_ext);
HYPRE_Complex *Bs_ext_data = hypre_CSRMatrixData(Bs_ext);
HYPRE_Int *Bs_ext_i = hypre_CSRMatrixI(Bs_ext);
HYPRE_BigInt *Bs_ext_j = hypre_CSRMatrixBigJ(Bs_ext);
HYPRE_Int num_rows_Bext = hypre_CSRMatrixNumRows(Bs_ext);
HYPRE_Int B_ext_diag_size = 0;
HYPRE_Int B_ext_offd_size = 0;
HYPRE_Int *B_ext_diag_i = NULL;
HYPRE_Int *B_ext_diag_j = NULL;
HYPRE_Int *B_ext_diag_i = NULL;
HYPRE_Int *B_ext_diag_j = NULL;
HYPRE_Complex *B_ext_diag_data = NULL;
HYPRE_Int *B_ext_offd_i = NULL;
HYPRE_Int *B_ext_offd_j = NULL;
HYPRE_Int *B_ext_offd_i = NULL;
HYPRE_Int *B_ext_offd_j = NULL;
HYPRE_BigInt *B_ext_offd_bigj = NULL;
HYPRE_Complex *B_ext_offd_data = NULL;

HYPRE_Int *my_diag_array;
HYPRE_Int *my_offd_array;
HYPRE_BigInt *temp = NULL;
HYPRE_Int max_num_threads;
HYPRE_Int num_threads = hypre_NumOptimalThreads(num_rows_Bext);

HYPRE_Int cnt = 0;
hypre_CSRMatrix *Bext_diag = NULL;
hypre_CSRMatrix *Bext_offd = NULL;
HYPRE_BigInt *col_map_offd_C = NULL;
HYPRE_Int num_cols_offd_C = 0;

B_ext_diag_i = hypre_CTAlloc(HYPRE_Int, num_rows_Bext + 1, HYPRE_MEMORY_HOST);
B_ext_offd_i = hypre_CTAlloc(HYPRE_Int, num_rows_Bext + 1, HYPRE_MEMORY_HOST);

max_num_threads = hypre_NumThreads();
my_diag_array = hypre_CTAlloc(HYPRE_Int, max_num_threads, HYPRE_MEMORY_HOST);
my_offd_array = hypre_CTAlloc(HYPRE_Int, max_num_threads, HYPRE_MEMORY_HOST);
B_ext_diag_i = hypre_CTAlloc(HYPRE_Int, num_rows_Bext + 1, HYPRE_MEMORY_HOST);
B_ext_offd_i = hypre_CTAlloc(HYPRE_Int, num_rows_Bext + 1, HYPRE_MEMORY_HOST);
my_diag_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
my_offd_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);

#ifdef HYPRE_USING_OPENMP
#pragma omp parallel
#pragma omp parallel num_threads(num_threads)
#endif
{
HYPRE_Int ns, ne, ii, num_threads;
HYPRE_Int ns, ne, ii;
HYPRE_Int i1, i, j;
HYPRE_Int my_offd_size, my_diag_size;
HYPRE_Int cnt_offd, cnt_diag;

ii = hypre_GetThreadNum();
num_threads = hypre_NumActiveThreads();
hypre_partition1D(num_rows_Bext, num_threads, ii, &ns, &ne);

my_diag_size = 0;
Expand Down Expand Up @@ -1536,13 +1551,13 @@ hypre_CSRMatrixSplit(hypre_CSRMatrix *Bs_ext,
HYPRE_Int
hypre_CSRMatrixReorderHost(hypre_CSRMatrix *A)
{
HYPRE_Complex *A_data = hypre_CSRMatrixData(A);
HYPRE_Int *A_i = hypre_CSRMatrixI(A);
HYPRE_Int *A_j = hypre_CSRMatrixJ(A);
HYPRE_Int *rownnz_A = hypre_CSRMatrixRownnz(A);
HYPRE_Int nnzrows_A = hypre_CSRMatrixNumRownnz(A);
HYPRE_Int num_rows_A = hypre_CSRMatrixNumRows(A);
HYPRE_Int num_cols_A = hypre_CSRMatrixNumCols(A);
HYPRE_Complex *A_data = hypre_CSRMatrixData(A);
HYPRE_Int *A_i = hypre_CSRMatrixI(A);
HYPRE_Int *A_j = hypre_CSRMatrixJ(A);
HYPRE_Int *rownnz_A = hypre_CSRMatrixRownnz(A);
HYPRE_Int nnzrows_A = hypre_CSRMatrixNumRownnz(A);
HYPRE_Int num_rows_A = hypre_CSRMatrixNumRows(A);
HYPRE_Int num_cols_A = hypre_CSRMatrixNumCols(A);

HYPRE_Int i, ii, j;

Expand Down
9 changes: 9 additions & 0 deletions src/utilities/HYPRE_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,15 @@ HYPRE_Int HYPRE_SetUseGpuRand( HYPRE_Int use_curand );
**/
HYPRE_Int HYPRE_SetGpuAwareMPI( HYPRE_Int use_gpu_aware_mpi );

/**
* Sets the number of threads to use in parallel regions.
* Must be called from outside of a parallel region.
*
* @param num_threads The number of threads to use
* @return Returns hypre's global error code
*/
HYPRE_Int HYPRE_SetNumThreads( HYPRE_Int num_threads );

/*--------------------------------------------------------------------------
* Base objects
*--------------------------------------------------------------------------*/
Expand Down
Loading