149 MPI_Comm comm = local_group_comm.comm;
155 int order = local_group_comm.start < remote_group_comm.start;
157 for (
int i = 0; i < 2; ++i) {
162 &data_size, 1, MPI_INT, remote_group_comm.start, local_group_comm);
163 recv_buffer =
xmalloc((
size_t)data_size);
165 remote_group_comm.start, local_group_comm);
168 if (comm_rank == local_group_comm.start) {
171 int pack_buffer_size =
173 void * pack_buffer =
xmalloc((
size_t)pack_buffer_size);
176 local_data, pack_buffer, pack_buffer_size, &position, comm);
180 &position, 1, MPI_INT, local_group_comm.start, remote_group_comm);
184 local_group_comm.start, remote_group_comm);
194 recv_buffer, data_size, &position, comm);
// Parameter list and body of the routine that the error text further down
// names compute_redist_recvcounts_rdispls; the line carrying the function
// name and return type is not part of this listing.
//   comm_rank, split_rank, comm_size: used to derive the caller's sub-group
//     (ranks >= split_rank form sub-group 1, all others sub-group 0)
//   global_sizes[c]: read for the caller's sub-group c only; presumably the
//     global element count of bucket c -- confirm against caller
//   all_bucket_sizes[i][c]: per-rank size of bucket c, accumulated in rank
//     order below
//   counts, displs: written at index i for every rank i whose bucket range
//     overlaps the caller's interval
//   recv_count: receives the length of the caller's interval
201 uint64_t comm_rank, uint64_t split_rank, uint64_t comm_size,
202 uint64_t global_sizes[2], uint64_t (*all_bucket_sizes)[2],
203 int * counts,
int * displs,
size_t * recv_count) {
// 0 for ranks below split_rank, 1 otherwise; also used as bucket index
205 int color = comm_rank >= split_rank;
// size of, and rank within, the caller's sub-group (values for sub-group 0)
207 uint64_t split_comm_size = split_rank;
208 uint64_t split_comm_rank = comm_rank;
// NOTE(review): the condition guarding the next two adjustments is on a line
// not shown here; presumably they apply only when color is 1 -- confirm
210 split_comm_rank -= split_comm_size;
211 split_comm_size = comm_size - split_comm_size;
214 uint64_t global_size = global_sizes[color];
// Bounds of the caller's interval. The divisor of both expressions is on
// lines not shown; if it is split_comm_size these are rounded-up shares of
// global_size -- TODO confirm
215 uint64_t local_interval_start =
216 (global_size * split_comm_rank + split_comm_size - 1) /
218 uint64_t local_interval_end =
219 (global_size * (split_comm_rank+1) + split_comm_size - 1) /
222 *recv_count = (size_t)(local_interval_end - local_interval_start);
// Walk over all ranks; start_idx is the running sum of the preceding ranks'
// bucket sizes for this color
224 for (uint64_t i = 0, start_idx = 0; i < comm_size; ++i) {
226 uint64_t next_start_idx = start_idx + all_bucket_sizes[i][color];
// intersect rank i's range [start_idx, next_start_idx) with the caller's
// interval
227 uint64_t interval_start =
MAX(start_idx, local_interval_start);
228 uint64_t interval_end =
MIN(next_start_idx, local_interval_end);
// only non-empty intersections are handled in the visible lines; what is
// stored for empty ones is not shown
230 if (interval_start < interval_end) {
232 uint64_t count = interval_end - interval_start;
// displacement is relative to the start of the caller's interval
233 uint64_t disp = interval_start - local_interval_start;
// range check before narrowing to int; the macro or function receiving this
// condition and message is named on a line not shown
236 (count <= INT_MAX) && (disp <= INT_MAX),
237 "ERROR(compute_redist_recvcounts_rdispls): invalid interval")
239 counts[i] = (int)count;
240 displs[i] = (int)disp;
246 start_idx = next_start_idx;
// Parameter list and body of the routine that the error text further down
// names compute_redist_sendcounts_sdispls; the line carrying the function
// name and return type is not part of this listing.
//   comm_rank: rank whose local bucket ranges are intersected below
//   split_rank, comm_size: ranks >= split_rank belong to sub-group 1, all
//     others to sub-group 0
//   global_sizes[c]: read per target rank for its sub-group c; presumably the
//     global element count of bucket c -- confirm against caller
//   all_bucket_sizes[i][c]: per-rank size of bucket c
//   U_size: added to the displacement of every sub-group-1 target
//   counts, displs: written at index i for every rank i whose interval
//     overlaps the local bucket range of the matching color
251 uint64_t comm_rank, uint64_t split_rank, uint64_t comm_size,
252 uint64_t global_sizes[2], uint64_t (*all_bucket_sizes)[2],
253 uint64_t U_size,
int * counts,
int * displs) {
// Range of this rank's elements per bucket: start is the sum of the bucket
// sizes of all lower ranks, end adds this rank's own bucket size
255 uint64_t local_interval_start[2] = {0, 0};
256 uint64_t local_interval_end[2];
257 for (uint64_t i = 0; i < comm_rank; ++i)
258 for (
int j = 0; j < 2; ++j)
259 local_interval_start[j] += all_bucket_sizes[i][j];
260 for (
int j = 0; j < 2; ++j)
261 local_interval_end[j] =
262 local_interval_start[j] + all_bucket_sizes[comm_rank][j];
// number of ranks in sub-group 0 and sub-group 1
264 uint64_t comm_sizes[2] = {split_rank, comm_size - split_rank};
// Walk over all target ranks; start_idx[c] tracks where the next target of
// sub-group c begins
266 for (uint64_t i = 0, start_idx[2] = {0,0}; i < comm_size; ++i) {
268 int color = i >= split_rank;
269 uint64_t global_size = global_sizes[color];
// rank of target i within its own sub-group
270 uint64_t split_comm_rank = i - (color?(split_rank):0);
271 uint64_t split_comm_size = comm_sizes[color];
// End of target i's interval. The divisor is on a line not shown; if it is
// split_comm_size this is a rounded-up share of global_size -- TODO confirm
272 uint64_t next_start_idx =
273 (global_size * (split_comm_rank + 1) + split_comm_size - 1) /
// intersect target i's interval with this rank's range for that color
275 uint64_t interval_start =
MAX(start_idx[color], local_interval_start[color]);
276 uint64_t interval_end =
MIN(next_start_idx, local_interval_end[color]);
// only non-empty intersections are handled in the visible lines; what is
// stored for empty ones is not shown
278 if (interval_start < interval_end) {
280 uint64_t count = interval_end - interval_start;
// offset within this rank's range of that color, shifted by U_size for
// color 1; presumably the send buffer stores U_size color-0 elements ahead
// of the color-1 elements -- verify against caller
281 uint64_t disp = interval_start - local_interval_start[color] +
282 ((color)?(U_size):(0));
// range check before narrowing to int; the macro or function receiving this
// condition and message is named on a line not shown
285 (count <= INT_MAX) && (disp <= INT_MAX),
286 "ERROR(compute_redist_sendcounts_sdispls): invalid interval")
288 counts[i] = (int)count;
289 displs[i] = (int)disp;
295 start_idx[color] = next_start_idx;
// Parameter list and body of a routine that splits the local cells into two
// buckets by the sign of a dot product, exchanges them between ranks and then
// hands the result to a further call (see the argument list near the end).
// The function-name line and one parameter line are not part of this listing.
//   cells, num_cells: in/out; *num_cells is overwritten with the post-exchange
//     count further down
//   dist_cell_dt: MPI datatype forwarded to the exchange call
329 struct dist_cell ** cells,
size_t * num_cells,
331 MPI_Datatype dist_cell_dt) {
// Score-P region handles; only compiled when SCOREP_USER_ENABLE is defined
// (the matching #endif lines are not shown in this listing)
333#ifdef SCOREP_USER_ENABLE
334SCOREP_USER_REGION_DEFINE( local_balance_point_region )
335SCOREP_USER_REGION_DEFINE( global_balance_point_region )
336SCOREP_USER_REGION_DEFINE( splitting_region )
337SCOREP_USER_REGION_DEFINE( comm_split_region )
338SCOREP_USER_REGION_DEFINE( redist_data_region )
349#ifdef SCOREP_USER_ENABLE
350SCOREP_USER_REGION_BEGIN(
351 local_balance_point_region,
"local balance point",
352 SCOREP_USER_REGION_TYPE_COMMON )
// component-wise sum of the coordinates of all local cells
355 double balance_point[3] = {0.0, 0.0, 0.0};
356 for (
size_t i = 0; i < *num_cells; ++i) {
357 double * cell_coord = (*cells)[i].coord;
358 for (
int j = 0; j < 3; ++j) balance_point[j] += cell_coord[j];
360#ifdef SCOREP_USER_ENABLE
361SCOREP_USER_REGION_END( local_balance_point_region )
362SCOREP_USER_REGION_BEGIN(
363 global_balance_point_region,
"global balance point",
364 SCOREP_USER_REGION_TYPE_COMMON )
// true when at least one component of balance_point exceeds 1e-9 in magnitude
371 if ((fabs(balance_point[0]) > 1e-9) ||
372 (fabs(balance_point[1]) > 1e-9) ||
373 (fabs(balance_point[2]) > 1e-9)) {
// Replaces balance_point by the components of prev_gc_norm_vector in the
// order (2, 0, 1). NOTE(review): the lines between the test above and these
// assignments are not shown; presumably this is the fallback branch for an
// all-(near-)zero balance point -- confirm
376 balance_point[0] = prev_gc_norm_vector[2];
377 balance_point[1] = prev_gc_norm_vector[0];
378 balance_point[2] = prev_gc_norm_vector[1];
394#ifdef SCOREP_USER_ENABLE
395SCOREP_USER_REGION_END( global_balance_point_region )
402#ifdef SCOREP_USER_ENABLE
403SCOREP_USER_REGION_BEGIN( splitting_region,
"splitting data", SCOREP_USER_REGION_TYPE_COMMON )
// left starts at the first cell, right at the last one.
// NOTE(review): with *num_cells == 0 the right pointer is formed one element
// before the start of the array
412 struct dist_cell * left = *cells, * right = *cells + *num_cells - 1;
418 while (left <= right) {
// scan from the left: stop at the first cell whose dot product with
// gc_norm_vector is positive (the pointer update is on lines not shown)
419 double * curr_coordinates_xyz = left->
coord;
420 double dot = curr_coordinates_xyz[0] * gc_norm_vector[0] +
421 curr_coordinates_xyz[1] * gc_norm_vector[1] +
422 curr_coordinates_xyz[2] * gc_norm_vector[2];
425 if (dot > 0.0)
break;
430 while (left < right) {
// scan from the right: stop at the first cell whose dot product with
// gc_norm_vector is not positive (the pointer update is on lines not shown)
431 double * curr_coordinates_xyz = right->
coord;
432 double dot = curr_coordinates_xyz[0] * gc_norm_vector[0] +
433 curr_coordinates_xyz[1] * gc_norm_vector[1] +
434 curr_coordinates_xyz[2] * gc_norm_vector[2];
437 if (dot <= 0.0)
break;
// U_size counts the cells in front of left, T_size all remaining cells
453 uint64_t U_size = (uint64_t)(left - *cells);
454 uint64_t T_size = (uint64_t)(*num_cells) - U_size;
456#ifdef SCOREP_USER_ENABLE
457SCOREP_USER_REGION_END( splitting_region )
461 uint64_t bucket_sizes[2] = {U_size, T_size};
// one {U, T} size pair per rank of the group
464 uint64_t (*all_bucket_sizes)[2] =
465 xmalloc((
size_t)group_size *
sizeof(*all_bucket_sizes));
// arguments of a call whose name is on a line not shown; presumably gathers
// every rank's two bucket sizes into all_bucket_sizes -- confirm
467 &(bucket_sizes[0]), &(all_bucket_sizes[0][0]), 2, group_comm);
// totals of each bucket over all ranks of the group
470 uint64_t global_bucket_sizes[2] = {0, 0};
471 for (
int i = 0; i < group_size; ++i)
472 for (
int j = 0; j < 2; ++j)
473 global_bucket_sizes[j] += all_bucket_sizes[i][j];
474 uint64_t global_num_cells = global_bucket_sizes[0] + global_bucket_sizes[1];
479#ifdef SCOREP_USER_ENABLE
480SCOREP_USER_REGION_BEGIN(
481 comm_split_region,
"creating splitcomm", SCOREP_USER_REGION_TYPE_COMMON )
// group_size scaled by bucket 0's share of all cells, rounded to nearest;
// the enclosing expression (with the trailing ", 1)") is only partly shown.
// NOTE(review): divides by global_num_cells, so it must be non-zero here
487 ((global_bucket_sizes[0] * (uint64_t)group_size + global_num_cells/2) /
488 global_num_cells), 1),
// arguments of a call whose name is not shown; local_group_comm and
// remote_group_comm are passed by address, presumably as outputs
494 group_comm, split_rank, &local_group_comm, &remote_group_comm);
496#ifdef SCOREP_USER_ENABLE
497SCOREP_USER_REGION_END( comm_split_region )
// single allocation holding four consecutive int arrays of group_size entries
// NOTE(review): no release of int_buffer is visible in this listing -- confirm
// it is freed on a line not shown
503 int * int_buffer =
xmalloc(4 * (
size_t)group_size *
sizeof(*int_buffer));
504 int * sendcounts = int_buffer + 0 * group_size;
505 int * recvcounts = int_buffer + 1 * group_size;
506 int * sdispls = int_buffer + 2 * group_size;
507 int * rdispls = int_buffer + 3 * group_size;
509#ifdef SCOREP_USER_ENABLE
510SCOREP_USER_REGION_BEGIN(
511 redist_data_region,
"data redistribution", SCOREP_USER_REGION_TYPE_COMMON )
// arguments of the call that fills sendcounts/sdispls (callee name not shown)
516 (uint64_t)group_rank, (uint64_t)split_rank, (uint64_t)group_size,
517 global_bucket_sizes, all_bucket_sizes, (uint64_t)U_size, sendcounts, sdispls);
518 size_t new_num_cells;
// arguments of the call that fills recvcounts/rdispls and new_num_cells
// (callee name not shown)
520 (uint64_t)group_rank, (uint64_t)split_rank, (uint64_t)group_size,
521 global_bucket_sizes, all_bucket_sizes, recvcounts, rdispls, &new_num_cells);
// arguments of the exchange call (name not shown): *cells is the send side,
// new_cells the receive side; presumably an all-to-all-v over group_comm
526 *cells, sendcounts, sdispls, new_cells, recvcounts, rdispls,
527 sizeof(**cells), dist_cell_dt, group_comm);
528#ifdef SCOREP_USER_ENABLE
529SCOREP_USER_REGION_END( redist_data_region )
// report the post-exchange cell count to the caller
533 *num_cells = new_num_cells;
536 free(all_bucket_sizes);
// arguments of a follow-up call on the local sub-group using the new normal
// vector; the callee name is not shown, presumably the recursive step
548 cells, num_cells, local_group_comm, gc_norm_vector, dist_cell_dt);
// ranks below split_rank store their own result in U and the remote one in
// T; the remaining lines store them the other way round (the else line is
// not shown)
563 if (group_rank < split_rank) {
564 node->U = local_data;
565 node->
T = remote_data;
567 node->U = remote_data;
568 node->
T = local_data;
// Parameter list and body of a routine that returns proc_sphere_part for the
// given cells on communicator comm; the function-name line is not part of
// this listing.
//   cells, num_cells: local cells and their count
//   comm: communicator over which the cell count is summed
583 struct dist_cell ** cells,
size_t * num_cells, MPI_Comm comm) {
// normal vector handed to the partitioning call below as its starting vector
585 double base_gc_norm_vector[3] = {0.0,0.0,1.0};
587 int comm_rank, comm_size;
591 uint64_t global_num_cells = (uint64_t)*num_cells;
// arguments of an in-place MPI_SUM reduction of one MPI_UINT64_T (callee name
// not shown); afterwards global_num_cells presumably holds the sum over all
// ranks of comm -- confirm
594 MPI_IN_PLACE, &global_num_cells, 1, MPI_UINT64_T, MPI_SUM, comm), comm);
// the partitioning call is only made with more than one rank and at least
// one cell overall
598 if ((comm_size > 1) && (global_num_cells > 0)) {
604 base_gc_norm_vector, dist_cell_dt);
// Single node whose U entry is a leaf referring to rank 0. NOTE(review): the
// branch structure around these lines is not shown; presumably this is the
// else case of the test above -- confirm
609 proc_sphere_part =
xmalloc(1 *
sizeof(*proc_sphere_part));
610 proc_sphere_part->U.data.rank = 0;
611 proc_sphere_part->U.is_leaf = 1;
619 return proc_sphere_part;