149 MPI_Comm comm = local_group_comm.comm;
153 void * recv_buffer = NULL;
155 int order = local_group_comm.start < remote_group_comm.start;
157 for (
int i = 0; i < 2; ++i) {
162 &data_size, 1, MPI_INT, remote_group_comm.start, local_group_comm);
163 recv_buffer =
xmalloc((
size_t)data_size);
165 remote_group_comm.start, local_group_comm);
168 if (comm_rank == local_group_comm.start) {
171 int pack_buffer_size =
173 void * pack_buffer =
xmalloc((
size_t)pack_buffer_size);
176 local_data, pack_buffer, pack_buffer_size, &position, comm);
180 &position, 1, MPI_INT, local_group_comm.start, remote_group_comm);
184 local_group_comm.start, remote_group_comm);
194 recv_buffer, data_size, &position, comm);
/*
 * Parameter list and body of the routine that reports itself as
 * compute_redist_recvcounts_rdispls in its error message.
 *
 * For the calling rank it determines which slice of the globally
 * concatenated bucket (bucket index = color) it is going to own, and how much
 * of that slice originates from every rank i in the communicator.
 *
 *   comm_rank, split_rank, comm_size  position of the caller and the split
 *   global_sizes[2]                   total number of elements per bucket
 *   all_bucket_sizes[i][b]            size of bucket b on rank i
 *   counts[i], displs[i]              (out) element count coming from rank i
 *                                     and its offset inside the local slice
 *   recv_count                        (out) length of the local slice
 *
 * NOTE(review): several original lines (function name line, the divisors of
 * the interval computations, the assertion macro name and the branch that
 * handles an empty intersection) are not visible in this view.
 */
201 uint64_t comm_rank, uint64_t split_rank, uint64_t comm_size,
202 uint64_t global_sizes[2], uint64_t (*all_bucket_sizes)[2],
203 int * counts,
int * displs,
size_t * recv_count) {
/* color is 0 for ranks below split_rank and 1 for all others; it selects the
   bucket this rank receives */
205 int color = comm_rank >= split_rank;
/* rank and size within the caller's half of the communicator; the two
   adjustments below convert them to the upper half
   (presumably guarded by "if (color)" on a line not visible here -- TODO
   confirm) */
207 uint64_t split_comm_size = split_rank;
208 uint64_t split_comm_rank = comm_rank;
210 split_comm_rank -= split_comm_size;
211 split_comm_size = comm_size - split_comm_size;
/* [local_interval_start, local_interval_end) is the caller's slice of the
   selected bucket; the "+ split_comm_size - 1" term indicates a rounded-up
   division (divisor line not visible -- presumably split_comm_size) */
214 uint64_t global_size = global_sizes[color];
215 uint64_t local_interval_start =
216 (global_size * split_comm_rank + split_comm_size - 1) /
218 uint64_t local_interval_end =
219 (global_size * (split_comm_rank+1) + split_comm_size - 1) /
222 *recv_count = (size_t)(local_interval_end - local_interval_start);
/* walk over all ranks; start_idx is the running prefix sum of the selected
   bucket, i.e. the global index of the first element contributed by rank i */
224 for (uint64_t i = 0, start_idx = 0; i < comm_size; ++i) {
226 uint64_t next_start_idx = start_idx + all_bucket_sizes[i][color];
/* intersect the range contributed by rank i with the local slice */
227 uint64_t interval_start =
MAX(start_idx, local_interval_start);
228 uint64_t interval_end =
MIN(next_start_idx, local_interval_end);
230 if (interval_start < interval_end) {
/* non-empty intersection: count and offset relative to the local slice */
232 uint64_t count = interval_end - interval_start;
233 uint64_t disp = interval_start - local_interval_start;
/* both values are stored as int below, hence the INT_MAX range check */
236 (count <= INT_MAX) && (disp <= INT_MAX),
237 "ERROR(compute_redist_recvcounts_rdispls): invalid interval")
239 counts[i] = (int)count;
240 displs[i] = (int)disp;
/* advance the prefix sum to the first element of the next rank */
246 start_idx = next_start_idx;
/*
 * Parameter list and body of the routine that reports itself as
 * compute_redist_sendcounts_sdispls in its error message.
 *
 * For the calling rank it determines how many of its local elements have to
 * go to every rank i and at which offset in the local array they start.
 *
 *   comm_rank, split_rank, comm_size  position of the caller and the split
 *   global_sizes[2]                   total number of elements per bucket
 *   all_bucket_sizes[i][b]            size of bucket b on rank i
 *   U_size                            offset added to displacements of
 *                                     bucket 1 (see disp computation below)
 *   counts[i], displs[i]              (out) element count for rank i and its
 *                                     offset in the local array
 *
 * NOTE(review): the function name line, the divisor of next_start_idx, the
 * assertion macro name and the branch for an empty intersection are not
 * visible in this view.
 */
251 uint64_t comm_rank, uint64_t split_rank, uint64_t comm_size,
252 uint64_t global_sizes[2], uint64_t (*all_bucket_sizes)[2],
253 uint64_t U_size,
int * counts,
int * displs) {
/* global index range, per bucket, that is covered by the local elements:
   start = sum of the bucket sizes of all lower ranks,
   end   = start + local bucket size */
255 uint64_t local_interval_start[2] = {0, 0};
256 uint64_t local_interval_end[2];
257 for (uint64_t i = 0; i < comm_rank; ++i)
258 for (
int j = 0; j < 2; ++j)
259 local_interval_start[j] += all_bucket_sizes[i][j];
260 for (
int j = 0; j < 2; ++j)
261 local_interval_end[j] =
262 local_interval_start[j] + all_bucket_sizes[comm_rank][j];
/* number of ranks below the split and number of ranks at or above it */
264 uint64_t comm_sizes[2] = {split_rank, comm_size - split_rank};
/* walk over all target ranks; start_idx[color] tracks, per bucket, the global
   index at which the slice of the current target rank begins */
266 for (uint64_t i = 0, start_idx[2] = {0,0}; i < comm_size; ++i) {
/* target rank i receives bucket 0 if it is below split_rank, else bucket 1 */
268 int color = i >= split_rank;
269 uint64_t global_size = global_sizes[color];
/* rank of the target within its half of the communicator */
270 uint64_t split_comm_rank = i - (color?(split_rank):0);
271 uint64_t split_comm_size = comm_sizes[color];
/* end of the target's slice; the "+ split_comm_size - 1" term indicates a
   rounded-up division (divisor line not visible -- presumably
   split_comm_size) */
272 uint64_t next_start_idx =
273 (global_size * (split_comm_rank + 1) + split_comm_size - 1) /
/* intersect the target's slice with the locally held index range */
275 uint64_t interval_start =
MAX(start_idx[color], local_interval_start[color]);
276 uint64_t interval_end =
MIN(next_start_idx, local_interval_end[color]);
278 if (interval_start < interval_end) {
280 uint64_t count = interval_end - interval_start;
/* offset within the local array; elements of bucket 1 are addressed with an
   additional offset of U_size */
281 uint64_t disp = interval_start - local_interval_start[color] +
282 ((color)?(U_size):(0));
/* both values are stored as int below, hence the INT_MAX range check */
285 (count <= INT_MAX) && (disp <= INT_MAX),
286 "ERROR(compute_redist_sendcounts_sdispls): invalid interval")
288 counts[i] = (int)count;
289 displs[i] = (int)disp;
/* the next target rank of the same color starts where this one ended */
295 start_idx[color] = next_start_idx;
/*
 * Parameter list and body of a routine that splits the vertices held by a
 * process group into two buckets and redistributes them between the two
 * halves of the group.
 *
 * Visible steps:
 *   1. accumulate the coordinates of all local vertices (balance_point)
 *   2. partition the local vertex array by the sign of the dot product of each
 *      vertex with gc_norm_vector
 *   3. gather the bucket sizes of all ranks and derive split_rank
 *   4. compute send/receive counts and displacements and exchange the
 *      vertices; *vertices and *num_vertices are replaced by the result
 *   5. store local and remote data in node->U and node->T
 *
 * NOTE(review): the function name line, further parameters, most callee names
 * and several branches are not visible in this view; comments below describe
 * only what the visible lines show.
 */
331 struct dist_vertex ** vertices,
size_t * num_vertices,
333 MPI_Datatype dist_vertex_dt) {
/* optional Score-P instrumentation: one user region per phase */
335#ifdef SCOREP_USER_ENABLE
336SCOREP_USER_REGION_DEFINE( local_balance_point_region )
337SCOREP_USER_REGION_DEFINE( global_balance_point_region )
338SCOREP_USER_REGION_DEFINE( splitting_region )
339SCOREP_USER_REGION_DEFINE( comm_split_region )
340SCOREP_USER_REGION_DEFINE( redist_data_region )
351#ifdef SCOREP_USER_ENABLE
352SCOREP_USER_REGION_BEGIN(
353 local_balance_point_region,
"local balance point",
354 SCOREP_USER_REGION_TYPE_COMMON )
/* component-wise sum of the coordinates of all local vertices */
357 double balance_point[3] = {0.0, 0.0, 0.0};
358 for (
size_t i = 0; i < *num_vertices; ++i) {
359 double * vertex_coord = (*vertices)[i].coord;
360 for (
int j = 0; j < 3; ++j) balance_point[j] += vertex_coord[j];
362#ifdef SCOREP_USER_ENABLE
363SCOREP_USER_REGION_END( local_balance_point_region )
364SCOREP_USER_REGION_BEGIN(
365 global_balance_point_region,
"global balance point",
366 SCOREP_USER_REGION_TYPE_COMMON )
/* test whether any component of balance_point exceeds 1e-9 in magnitude;
   the three assignments below overwrite balance_point with the cyclically
   permuted components of prev_gc_norm_vector.
   NOTE(review): the lines between the condition and the assignments are not
   visible, so it cannot be told from here which branch they belong to */
373 if ((fabs(balance_point[0]) > 1e-9) ||
374 (fabs(balance_point[1]) > 1e-9) ||
375 (fabs(balance_point[2]) > 1e-9)) {
378 balance_point[0] = prev_gc_norm_vector[2];
379 balance_point[1] = prev_gc_norm_vector[0];
380 balance_point[2] = prev_gc_norm_vector[1];
396#ifdef SCOREP_USER_ENABLE
397SCOREP_USER_REGION_END( global_balance_point_region )
404#ifdef SCOREP_USER_ENABLE
405SCOREP_USER_REGION_BEGIN( splitting_region,
"splitting data", SCOREP_USER_REGION_TYPE_COMMON )
/* two-pointer partition of the local vertex array: left starts at the first
   element, right at the last one */
414 struct dist_vertex * left = *vertices, * right = *vertices+ *num_vertices - 1;
420 while (left <= right) {
/* scan from the left; stop at the first vertex with a positive dot product */
421 double * curr_coordinates_xyz = left->
coord;
422 double dot = curr_coordinates_xyz[0] * gc_norm_vector[0] +
423 curr_coordinates_xyz[1] * gc_norm_vector[1] +
424 curr_coordinates_xyz[2] * gc_norm_vector[2];
427 if (dot > 0.0)
break;
/* scan from the right; stop at the first vertex with a non-positive dot
   product (pointer updates and the element swap are on lines not visible
   here) */
432 while (left < right) {
433 double * curr_coordinates_xyz = right->
coord;
434 double dot = curr_coordinates_xyz[0] * gc_norm_vector[0] +
435 curr_coordinates_xyz[1] * gc_norm_vector[1] +
436 curr_coordinates_xyz[2] * gc_norm_vector[2];
439 if (dot <= 0.0)
break;
/* U_size = number of elements in front of left, T_size = all remaining ones */
455 uint64_t U_size = (uint64_t)(left - *vertices);
456 uint64_t T_size = (uint64_t)(*num_vertices) - U_size;
458#ifdef SCOREP_USER_ENABLE
459SCOREP_USER_REGION_END( splitting_region )
463 uint64_t bucket_sizes[2] = {U_size, T_size};
/* one pair of bucket sizes per rank of the group; filled by a collective call
   over group_comm whose name is not visible here (two values per rank) */
466 uint64_t (*all_bucket_sizes)[2] =
467 xmalloc((
size_t)group_size *
sizeof(*all_bucket_sizes));
469 &(bucket_sizes[0]), &(all_bucket_sizes[0][0]), 2, group_comm);
/* total size of each bucket over all ranks and overall vertex count */
472 uint64_t global_bucket_sizes[2] = {0, 0};
473 for (
int i = 0; i < group_size; ++i)
474 for (
int j = 0; j < 2; ++j)
475 global_bucket_sizes[j] += all_bucket_sizes[i][j];
476 uint64_t global_num_vertices =
477 global_bucket_sizes[0] + global_bucket_sizes[1];
482#ifdef SCOREP_USER_ENABLE
483SCOREP_USER_REGION_BEGIN(
484 comm_split_region,
"creating splitcomm", SCOREP_USER_REGION_TYPE_COMMON )
/* share of bucket 0 in the total, scaled to group_size and rounded to the
   nearest integer; the trailing ", 1)" suggests a lower bound of 1 applied by
   an enclosing call that is not visible here -- TODO confirm.
   NOTE(review): divides by global_num_vertices, so callers must not reach
   this point with an empty group */
490 ((global_bucket_sizes[0] * (uint64_t)group_size +
491 global_num_vertices/2) / global_num_vertices), 1),
/* arguments of the call that derives local_group_comm and remote_group_comm
   from group_comm at split_rank (callee name not visible) */
497 group_comm, split_rank, &local_group_comm, &remote_group_comm);
499#ifdef SCOREP_USER_ENABLE
500SCOREP_USER_REGION_END( comm_split_region )
/* a single allocation provides four int arrays of group_size entries each */
506 int * int_buffer =
xmalloc(4 * (
size_t)group_size *
sizeof(*int_buffer));
507 int * sendcounts = int_buffer + 0 * group_size;
508 int * recvcounts = int_buffer + 1 * group_size;
509 int * sdispls = int_buffer + 2 * group_size;
510 int * rdispls = int_buffer + 3 * group_size;
512#ifdef SCOREP_USER_ENABLE
513SCOREP_USER_REGION_BEGIN(
514 redist_data_region,
"data redistribution", SCOREP_USER_REGION_TYPE_COMMON )
/* arguments of the calls that fill the send and the receive side of the
   exchange (callee names not visible; presumably the
   compute_redist_sendcounts_sdispls / compute_redist_recvcounts_rdispls
   routines, whose parameter lists match) */
519 (uint64_t)group_rank, (uint64_t)split_rank, (uint64_t)group_size,
520 global_bucket_sizes, all_bucket_sizes, (uint64_t)U_size, sendcounts, sdispls);
521 size_t new_num_vertices;
523 (uint64_t)group_rank, (uint64_t)split_rank, (uint64_t)group_size,
524 global_bucket_sizes, all_bucket_sizes, recvcounts, rdispls,
/* receive buffer with new_num_vertices elements, followed by the arguments of
   the exchange call over group_comm (callee name not visible) */
529 xmalloc(new_num_vertices *
sizeof(*new_vertices));
531 *vertices, sendcounts, sdispls, new_vertices, recvcounts, rdispls,
532 sizeof(**vertices), dist_vertex_dt, group_comm);
533#ifdef SCOREP_USER_ENABLE
534SCOREP_USER_REGION_END( redist_data_region )
/* hand the redistributed array and its length back to the caller
   (release of the previous array is not visible in this view) */
537 *vertices = new_vertices;
538 *num_vertices = new_num_vertices;
541 free(all_bucket_sizes);
/* arguments of a call that continues with the local half of the group and the
   current gc_norm_vector (callee name not visible) */
553 vertices, num_vertices, local_group_comm, gc_norm_vector,
/* ranks below split_rank store their own data in U and the remote data in T;
   the second pair of assignments stores them the other way round
   (presumably the else branch -- its line is not visible) */
569 if (group_rank < split_rank) {
570 node->U = local_data;
571 node->
T = remote_data;
573 node->U = remote_data;
574 node->
T = local_data;
/*
 * Parameter list and body of the entry routine that returns the
 * proc_sphere_part structure for the vertices distributed over comm.
 *
 *   vertices, num_vertices  local vertex array and its length; both are passed
 *                           on by address to the call below
 *   comm                    communicator over which the vertices are
 *                           distributed
 *
 * NOTE(review): the function name line, the callee names and the line that
 * separates the two cases are not visible in this view.
 */
589 struct dist_vertex ** vertices,
size_t * num_vertices, MPI_Comm comm) {
/* initial normal vector: unit vector along the third coordinate axis */
591 double base_gc_norm_vector[3] = {0.0,0.0,1.0};
593 int comm_rank, comm_size;
/* start from the local count; the following call sums it in place over comm
   (MPI_IN_PLACE with MPI_SUM on one MPI_UINT64_T; callee name not visible,
   presumably an all-reduce) */
597 uint64_t global_num_vertices = (uint64_t)*num_vertices;
600 MPI_IN_PLACE, &global_num_vertices, 1, MPI_UINT64_T, MPI_SUM, comm), comm);
/* only with more than one process and at least one vertex overall are the
   vertices handed to the call below, together with group_comm and the initial
   normal vector */
604 if ((comm_size > 1) && (global_num_vertices > 0)) {
610 vertices, num_vertices, group_comm, base_gc_norm_vector,
/* single node whose U entry is marked as a leaf referring to rank 0
   (presumably the else branch of the condition above -- TODO confirm) */
616 proc_sphere_part =
xmalloc(1 *
sizeof(*proc_sphere_part));
617 proc_sphere_part->U.data.rank = 0;
618 proc_sphere_part->U.is_leaf = 1;
626 return proc_sphere_part;