49#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50#define _ZOLTAN2_ALGMultiJagged_HPP_
59#include <Tpetra_Distributor.hpp>
60#include <Teuchos_StandardParameterEntryValidators.hpp>
61#include <Teuchos_ParameterList.hpp>
62#include <Kokkos_Sort.hpp>
66#include <unordered_map>
68#ifdef ZOLTAN2_USEZOLTANCOMM
69#ifdef HAVE_ZOLTAN2_MPI
70#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71#include "zoltan_comm_cpp.h"
72#include "zoltan_types.h"
81template <
typename Ordinal,
typename T>
105 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const {
106 for(Ordinal i = 0; i < count; i++) {
108 inoutBuffer[i] = inBuffer[i];
124template <
typename IT,
typename CT,
typename WT>
139 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
144 this->index = index_;
145 this->count = count_;
147 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
153 void set(IT index_ ,CT count_, WT *vals_) {
154 this->index = index_;
155 this->count = count_;
160 assert(this->count == other.
count);
161 for(CT i = 0; i < this->
count; ++i) {
163 if(std::abs(this->val[i] - other.
val[i]) < this->epsilon) {
167 if(this->val[i] < other.
val[i]) {
176 return this->index < other.
index;
182template <
class IT,
class WT>
193template <
class IT,
class WT>
195 const int NSTACK = 50;
197 IT i, ir=n, j, k, l=1;
198 IT jstack=0, istack[NSTACK];
205 for(j=l+1;j<=ir;j++) {
208 for(i=j-1;i>=1;i--) {
209 if(arr[i].val <= aval)
222 std::swap(arr[k],arr[l+1]);
223 if(arr[l+1].val > arr[ir].val) {
224 std::swap(arr[l+1],arr[ir]);
226 if(arr[l].val > arr[ir].val) {
227 std::swap(arr[l],arr[ir]);
229 if(arr[l+1].val > arr[l].val) {
230 std::swap(arr[l+1],arr[l]);
237 do i++;
while (arr[i].val < aval);
238 do j--;
while (arr[j].val > aval);
240 std::swap(arr[i],arr[j]);
245 if(jstack > NSTACK) {
246 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
263template <
class IT,
class WT,
class SIGN>
271 if(this->signbit < rhs.
signbit) {
275 else if(this->signbit == rhs.
signbit) {
276 if(this->val < rhs.
val) {
280 else if(this->val > rhs.
val) {
295 return (this->val == rhs.
val && this->signbit == rhs.
signbit) || (*
this < rhs);
302template <
class IT,
class WT,
class SIGN>
304 const IT NSTACK = 50;
306 IT i, ir=n, j, k, l=1;
307 IT jstack=0, istack[NSTACK];
313 for(j=l+1;j<=ir;j++) {
315 for(i=j-1;i>=1;i--) {
331 std::swap(arr[k],arr[l+1]);
332 if(arr[ir] < arr[l+1]) {
333 std::swap(arr[l+1],arr[ir]);
335 if(arr[ir] < arr[l] ) {
336 std::swap(arr[l],arr[ir]);
338 if(arr[l] < arr[l+1]) {
339 std::swap(arr[l+1],arr[l]);
345 do i++;
while (arr[i] < a);
346 do j--;
while (a < arr[j]);
348 std::swap(arr[i],arr[j]);
353 if(jstack > NSTACK) {
354 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
390 static int counter = 0;
394 static int counter = 0;
401template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
402 typename mj_part_t,
typename mj_node_t>
406 typedef typename mj_node_t::device_type device_t;
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
413 static constexpr size_t future_reduceall_cutoff = 1500000;
417 static constexpr mj_lno_t min_work_last_dim = 1000;
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr int significance_mul = 1000;
422 std::string mj_timer_base_string;
424 RCP<const Environment> mj_env;
425 RCP<const Comm<int> > mj_problemComm;
426 RCP<Comm<int> > comm;
427 double imbalance_tolerance;
430 int num_weights_per_coord;
431 size_t initial_num_loc_coords;
433 mj_lno_t num_local_coords;
434 mj_gno_t num_global_coords;
435 mj_scalar_t sEpsilon;
438 bool distribute_points_on_cut_lines;
441 mj_part_t max_concurrent_part_calculation;
444 int mj_user_recursion_depth;
445 bool mj_keep_part_boxes;
448 int check_migrate_avoid_migration_option;
455 double minimum_migration_imbalance;
472 mj_part_t num_first_level_parts;
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
478 mj_part_t total_num_cut ;
479 mj_part_t total_num_part;
481 mj_part_t max_num_part_along_dim ;
482 mj_part_t max_num_cut_along_dim;
485 size_t max_num_total_part_along_dim;
487 mj_part_t total_dim_num_reduce_all;
491 mj_part_t last_dim_num_part;
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
512 size_t num_global_parts;
515 RCP<mj_partBoxVector_t> kept_boxes;
517 RCP<mj_partBox_t> global_box;
522 bool divide_to_prime_first;
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
607 Kokkos::View<double *, device_t>
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
625 Kokkos::View<mj_lno_t *, device_t>
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim;
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) /
static_cast<double>(expected) - 1.0;
660 void set_part_specifications();
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
754 bool mj_perform_migration(
755 mj_part_t in_num_parts,
756 mj_part_t &out_num_parts,
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
787 mj_gno_t *num_points_in_all_processor_parts);
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
876 void assign_send_destinations(
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
897 void assign_send_destinations2(
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
950 void mj_migrate_coords(
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
974 while (n % divisor == 0) {
976 largest_factor = divisor;
979 if(divisor * divisor > n) {
986 return largest_factor;
1019 const RCP<const Environment> &env,
1020 RCP<
const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1073 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1133#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1141 void allocate_set_work_memory();
1144 void compute_global_box();
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1236 void set_initial_coordinate_parts(
1237 mj_scalar_t &max_coordinate,
1238 mj_scalar_t &min_coordinate,
1239 mj_lno_t coordinate_begin_index,
1240 mj_lno_t coordinate_end_index,
1241 Kokkos::View<mj_lno_t *, device_t> &
1242 mj_current_coordinate_permutations,
1243 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245 mj_part_t &partition_count);
1264 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265 double imbalanceTolerance,
1266 mj_part_t current_work_part,
1267 mj_part_t current_concurrent_num_parts,
1268 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269 mj_part_t total_incomplete_cut_count,
1270 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1278 void mj_1D_part_get_part_weights(
1279 mj_part_t current_concurrent_num_parts,
1280 mj_part_t current_work_part,
1281 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1291 void mj_combine_rightleft_and_weights(
1292 mj_part_t current_work_part,
1293 mj_part_t current_concurrent_num_parts);
1307 void mj_create_new_partitions(
1308 mj_part_t num_parts,
1309 mj_part_t current_concurrent_work_part,
1310 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1350 void mj_get_new_cut_coordinates(
1351 mj_part_t current_concurrent_num_parts,
1353 const mj_part_t &num_cuts,
1354 const double &used_imbalance_tolerance,
1355 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367 Kokkos::View<mj_scalar_t *, device_t> &
1368 current_part_cut_line_weight_to_put_left,
1369 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1380 void get_processor_num_points_in_parts(
1381 mj_part_t num_procs,
1382 mj_part_t num_parts,
1383 mj_gno_t *&num_points_in_all_processor_parts);
1389 void fill_permutation_array(
1390 mj_part_t output_num_parts,
1391 mj_part_t num_parts);
1414 void create_consistent_chunks(
1415 mj_part_t num_parts,
1416 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418 mj_lno_t coordinate_begin,
1419 mj_lno_t coordinate_end,
1420 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1423 bool longest_dim_part,
1434 void set_final_parts(
1435 mj_part_t current_num_parts,
1436 mj_part_t output_part_begin_index,
1437 RCP<mj_partBoxVector_t> &output_part_boxes,
1438 bool is_data_ever_migrated);
1443template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1444 typename mj_part_t,
typename mj_node_t>
1446 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447 recursion_depth(0), coord_dim(0),
1448 num_weights_per_coord(0), initial_num_loc_coords(0),
1449 initial_num_glob_coords(0),
1450 num_local_coords(0), num_global_coords(0),
1451 sEpsilon(std::numeric_limits<mj_scalar_t>::
epsilon() * 100),
1452 distribute_points_on_cut_lines(true),
1453 max_concurrent_part_calculation(1),
1454 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455 mj_keep_part_boxes(false),
1456 check_migrate_avoid_migration_option(0), migration_type(0),
1457 minimum_migration_imbalance(0.30),
1458 num_first_level_parts(1),
1459 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460 max_num_cut_along_dim(0),
1461 max_num_total_part_along_dim(0),
1462 total_dim_num_reduce_all(0),
1463 last_dim_num_part(0),
1465 num_global_parts(1),
1466 kept_boxes(), global_box(),
1467 myRank(0), myActualRank(0),
1468 divide_to_prime_first(false)
1515template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1516 typename mj_part_t,
typename mj_node_t>
1519 const RCP<const Environment> &env,
1520 mj_lno_t num_total_coords,
1521 mj_lno_t num_selected_coords,
1522 size_t num_target_part,
1525 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1527 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528 mj_lno_t *output_xadj,
1529 int recursion_depth_,
1530 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531 bool partition_along_longest_dim,
1532 int num_ranks_per_node,
1533 bool divide_to_prime_first_,
1534 mj_part_t num_first_level_parts_,
1535 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1538 const RCP<Comm<int> > commN;
1539 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1541 this->myActualRank = this->myRank = 1;
1543 this->divide_to_prime_first = divide_to_prime_first_;
1548 this->imbalance_tolerance = 0;
1549 this->num_global_parts = num_target_part;
1550 this->part_no_array = part_no_array_;
1551 this->recursion_depth = recursion_depth_;
1555 this->num_first_level_parts = num_first_level_parts_;
1557 this->first_level_distribution = first_level_distribution_;
1559 this->coord_dim = coord_dim_;
1560 this->num_local_coords = num_total_coords;
1562 this->num_global_coords = num_total_coords;
1563 this->mj_coordinates = mj_coordinates_;
1566 this->initial_mj_gnos =
1567 Kokkos::View<mj_gno_t*, device_t>(
"gids", this->num_local_coords);
1569 this->num_weights_per_coord = 0;
1571 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572 "uniform weights", 1);
1573 this->mj_uniform_weights(0) =
true;
1575 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1578 this->mj_uniform_parts =
1579 Kokkos::View<bool*, Kokkos::HostSpace>(
"uniform parts", 1);
1580 this->mj_uniform_parts(0) =
true;
1582 this->set_part_specifications();
1584 this->allocate_set_work_memory();
1587 auto local_part_xadj = this->part_xadj;
1588 Kokkos::parallel_for(
1589 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590 KOKKOS_LAMBDA (
int dummy) {
1591 local_part_xadj(0) =
static_cast<mj_lno_t
>(num_selected_coords);
1594 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1596 mj_part_t current_num_parts = 1;
1598 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599 this->all_cut_coordinates;
1601 mj_part_t future_num_parts = this->total_num_part;
1603 std::vector<mj_part_t> *future_num_part_in_parts =
1604 new std::vector<mj_part_t>();
1605 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606 new std::vector<mj_part_t>();
1607 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1608 RCP<mj_partBoxVector_t> t1;
1609 RCP<mj_partBoxVector_t> t2;
1611 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612 coord_dimension_range_sorted(this->coord_dim);
1614 &(coord_dimension_range_sorted[0]);
1615 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1620 Kokkos::View<mj_part_t*, device_t>
1621 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
1622 Kokkos::View<size_t*, device_t>
1623 view_total_reduction_size(
"view_total_reduction_size", 1);
1625 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
1631 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632 future_num_part_in_parts = next_future_num_parts_in_parts;
1633 next_future_num_parts_in_parts = tmpPartVect;
1637 next_future_num_parts_in_parts->clear();
1640 mj_part_t output_part_count_in_dimension =
1641 this->update_part_num_arrays(
1642 future_num_part_in_parts,
1643 next_future_num_parts_in_parts,
1648 t2, num_ranks_per_node);
1653 if(output_part_count_in_dimension == current_num_parts) {
1654 tmpPartVect = future_num_part_in_parts;
1655 future_num_part_in_parts = next_future_num_parts_in_parts;
1656 next_future_num_parts_in_parts = tmpPartVect;
1661 std::string istring = std::to_string(rd);
1665 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666 "new part xadj", output_part_count_in_dimension);
1670 mj_part_t output_part_index = 0;
1674 mj_part_t output_coordinate_end_index = 0;
1676 mj_part_t current_work_part = 0;
1677 mj_part_t current_concurrent_num_parts = 1;
1679 mj_part_t obtained_part_index = 0;
1682 int coordInd = rd % this->coord_dim;
1684 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1687 auto host_process_local_min_max_coord_total_weight =
1688 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689 auto host_global_min_max_coord_total_weight =
1690 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1693 for(; current_work_part < current_num_parts;
1694 current_work_part += current_concurrent_num_parts) {
1696 mj_part_t actual_work_part_count = 0;
1701 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702 mj_part_t current_work_part_in_concurrent_parts =
1703 current_work_part + kk;
1707 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708 current_work_part_in_concurrent_parts);
1709 if(partition_count == 1) {
1712 ++actual_work_part_count;
1713 if(partition_along_longest_dim) {
1714 auto local_process_local_min_max_coord_total_weight =
1715 this->process_local_min_max_coord_total_weight;
1716 for(
int coord_traverse_ind = 0;
1717 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1719 Kokkos::View<mj_scalar_t *, device_t> coords =
1720 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1722 this->mj_get_local_min_max_coord_totW(
1724 current_concurrent_num_parts,
1727 coord_dimension_range_sorted[coord_traverse_ind].id =
1729 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1731 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732 process_local_min_max_coord_total_weight);
1734 coord_dim_mins[coord_traverse_ind] =
1735 host_process_local_min_max_coord_total_weight(kk);
1736 coord_dim_maxs[coord_traverse_ind] =
1737 host_process_local_min_max_coord_total_weight(
1738 kk + current_concurrent_num_parts);
1739 coord_dimension_range_sorted[coord_traverse_ind].val =
1740 host_process_local_min_max_coord_total_weight(
1741 kk + current_concurrent_num_parts) -
1742 host_process_local_min_max_coord_total_weight(kk);
1745 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1747 auto set_min = coord_dim_mins[coordInd];
1748 auto set_max = coord_dim_maxs[coordInd];
1749 Kokkos::parallel_for(
1750 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1752 local_process_local_min_max_coord_total_weight(kk) = set_min;
1753 local_process_local_min_max_coord_total_weight(
1754 kk + current_concurrent_num_parts) = set_max;
1757 mj_current_dim_coords =
1758 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1761 Kokkos::View<mj_scalar_t *, device_t> coords =
1762 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763 this->mj_get_local_min_max_coord_totW(
1765 current_concurrent_num_parts,
1771 if(actual_work_part_count > 0) {
1773 this->mj_get_global_min_max_coord_totW(
1774 current_concurrent_num_parts,
1775 this->process_local_min_max_coord_total_weight,
1776 this->global_min_max_coord_total_weight);
1779 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780 global_min_max_coord_total_weight);
1784 mj_part_t total_incomplete_cut_count = 0;
1789 mj_part_t concurrent_part_cut_shift = 0;
1790 mj_part_t concurrent_part_part_shift = 0;
1791 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1792 mj_scalar_t min_coordinate =
1793 host_global_min_max_coord_total_weight(kk);
1794 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795 kk + current_concurrent_num_parts);
1796 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797 kk + 2*current_concurrent_num_parts);
1799 mj_part_t concurrent_current_part_index = current_work_part + kk;
1801 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802 concurrent_current_part_index);
1804 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805 Kokkos::subview(current_cut_coordinates,
1806 std::pair<mj_lno_t, mj_lno_t>(
1807 concurrent_part_cut_shift,
1808 current_cut_coordinates.size()));
1809 Kokkos::View<mj_scalar_t *, device_t>
1810 current_target_part_weights =
1811 Kokkos::subview(target_part_weights,
1812 std::pair<mj_lno_t, mj_lno_t>(
1813 concurrent_part_part_shift,
1814 target_part_weights.size()));
1817 concurrent_part_cut_shift += partition_count - 1;
1819 concurrent_part_part_shift += partition_count;
1822 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1825 total_incomplete_cut_count += partition_count - 1;
1827 this->incomplete_cut_count(kk) = partition_count - 1;
1835 this->mj_get_initial_cut_coords_target_weights(
1838 partition_count - 1,
1839 global_total_weight,
1841 current_target_part_weights,
1842 future_num_part_in_parts,
1843 next_future_num_parts_in_parts,
1844 concurrent_current_part_index,
1845 obtained_part_index,
1846 rd == 0 ? this->num_first_level_parts : 1,
1847 this->first_level_distribution);
1849 mj_lno_t coordinate_end_index =
1850 host_part_xadj(concurrent_current_part_index);
1851 mj_lno_t coordinate_begin_index =
1852 (concurrent_current_part_index==0) ? 0 :
1853 host_part_xadj[concurrent_current_part_index - 1];
1856 this->set_initial_coordinate_parts(
1859 coordinate_begin_index, coordinate_end_index,
1860 this->coordinate_permutations,
1861 mj_current_dim_coords,
1862 this->assigned_part_ids,
1868 this->incomplete_cut_count(kk) = 0;
1870 obtained_part_index += partition_count;
1875 double used_imbalance = 0;
1879 mj_timer_base_string +
"mj_1D_part()");
1882 mj_current_dim_coords,
1885 current_concurrent_num_parts,
1886 current_cut_coordinates,
1887 total_incomplete_cut_count,
1888 view_rectilinear_cut_count,
1889 view_total_reduction_size);
1892 mj_timer_base_string +
"mj_1D_part()");
1895 obtained_part_index += current_concurrent_num_parts;
1899 mj_part_t output_array_shift = 0;
1900 mj_part_t cut_shift = 0;
1901 size_t tlr_shift = 0;
1902 size_t partweight_array_shift = 0;
1904 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905 mj_part_t current_concurrent_work_part = current_work_part + kk;
1907 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908 current_concurrent_work_part);
1911 int coordinateA_bigger_than_coordinateB =
1912 host_global_min_max_coord_total_weight(kk) >
1913 host_global_min_max_coord_total_weight(
1914 kk + current_concurrent_num_parts);
1916 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1919 auto local_new_part_xadj = this->new_part_xadj;
1920 Kokkos::parallel_for(
1921 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
1922 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923 local_new_part_xadj(
1924 output_part_index + output_array_shift + jj) = 0;
1927 cut_shift += num_parts - 1;
1928 tlr_shift += (4 *(num_parts - 1) + 1);
1929 output_array_shift += num_parts;
1930 partweight_array_shift += (2 * (num_parts - 1) + 1);
1933 mj_lno_t coordinate_end =
1934 host_part_xadj(current_concurrent_work_part);
1935 mj_lno_t coordinate_begin =
1936 current_concurrent_work_part==0 ? 0 :
1937 host_part_xadj(current_concurrent_work_part-1);
1939 Kokkos::View<mj_scalar_t *, device_t>
1940 current_concurrent_cut_coordinate =
1941 Kokkos::subview(current_cut_coordinates,
1942 std::pair<mj_lno_t, mj_lno_t>(
1944 current_cut_coordinates.size()));
1945 Kokkos::View<mj_scalar_t *, device_t>
1946 used_local_cut_line_weight_to_left =
1947 Kokkos::subview(process_cut_line_weight_to_put_left,
1948 std::pair<mj_lno_t, mj_lno_t>(
1950 process_cut_line_weight_to_put_left.size()));
1952 this->thread_part_weight_work =
1954 this->thread_part_weights,
1955 std::pair<mj_lno_t, mj_lno_t>(
1956 partweight_array_shift,
1957 this->thread_part_weights.size()));
1961 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962 Kokkos::subview(this->new_part_xadj,
1963 std::pair<mj_lno_t, mj_lno_t>(
1964 output_part_index + output_array_shift,
1965 this->new_part_xadj.size()));
1967 this->create_consistent_chunks(
1969 mj_current_dim_coords,
1970 current_concurrent_cut_coordinate,
1973 used_local_cut_line_weight_to_left,
1974 subview_new_part_xadj,
1976 partition_along_longest_dim,
1977 p_coord_dimension_range_sorted);
1982 mj_lno_t part_size = coordinate_end - coordinate_begin;
1984 auto local_new_part_xadj = this->new_part_xadj;
1985 Kokkos::parallel_for(
1986 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1988 local_new_part_xadj(output_part_index + output_array_shift)
1992 auto subview_new_coordinate_permutations =
1993 Kokkos::subview(this->new_coordinate_permutations,
1994 std::pair<mj_lno_t, mj_lno_t>(
1996 coordinate_begin + part_size));
1997 auto subview_coordinate_permutations =
1998 Kokkos::subview(this->coordinate_permutations,
1999 std::pair<mj_lno_t, mj_lno_t>(
2001 coordinate_begin + part_size));
2002 Kokkos::deep_copy(subview_new_coordinate_permutations,
2003 subview_coordinate_permutations);
2006 cut_shift += num_parts - 1;
2007 tlr_shift += (4 *(num_parts - 1) + 1);
2008 output_array_shift += num_parts;
2009 partweight_array_shift += (2 * (num_parts - 1) + 1);
2018 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019 mj_part_t num_parts =
2020 host_num_partitioning_in_current_dim(current_work_part + kk);
2021 auto local_new_part_xadj = this->new_part_xadj;
2022 auto local_mj_current_dim_coords = mj_current_dim_coords;
2023 auto local_new_coordinate_permutations =
2024 new_coordinate_permutations;
2025 Kokkos::parallel_for(
2026 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2029 local_new_part_xadj(output_part_index+ii) +=
2030 output_coordinate_end_index;
2033 mj_lno_t coordinate_end =
2034 local_new_part_xadj(output_part_index+ii);
2035 mj_lno_t coordinate_begin =
2036 local_new_part_xadj(output_part_index);
2038 for(mj_lno_t task_traverse = coordinate_begin;
2039 task_traverse < coordinate_end; ++task_traverse) {
2040 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2042 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2048 mj_part_t get_single;
2049 Kokkos::parallel_reduce(
"Read new_part_xadj",
2050 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
2052 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2055 output_coordinate_end_index = get_single;
2057 output_part_index += num_parts;
2064 current_num_parts = output_part_count_in_dimension;
2067 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068 this->coordinate_permutations = this->new_coordinate_permutations;
2069 this->new_coordinate_permutations = tmp;
2071 this->part_xadj = this->new_part_xadj;
2072 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073 Kokkos::deep_copy(host_part_xadj, part_xadj);
2074 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
2077 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2081 for(
size_t i = 0; i < this->num_global_parts ; ++i) {
2082 output_xadj[i+1] = host_part_xadj(i);
2085 delete future_num_part_in_parts;
2086 delete next_future_num_parts_in_parts;
2092template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2093 typename mj_part_t,
typename mj_node_t>
2095 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2099 return this->global_box;
2104template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2105 typename mj_part_t,
typename mj_node_t>
2106void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107 mj_node_t>::set_to_keep_part_boxes()
2109 this->mj_keep_part_boxes =
true;
2119template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2120 typename mj_part_t,
typename mj_node_t>
2124 this->total_num_cut = 0;
2125 this->total_num_part = 1;
2126 this->max_num_part_along_dim = 0;
2127 this->total_dim_num_reduce_all = 0;
2128 this->last_dim_num_part = 1;
2131 this->max_num_cut_along_dim = 0;
2132 this->max_num_total_part_along_dim = 0;
2134 if(this->part_no_array.size()) {
2135 auto local_recursion_depth = this->recursion_depth;
2137 this->total_dim_num_reduce_all =
2138 this->total_num_part * this->recursion_depth;
2140 this->total_num_part = 1;
2141 for(
int i = 0; i < local_recursion_depth; ++i) {
2142 this->total_num_part *= this->part_no_array(i);
2145 mj_part_t track_max = 0;
2146 for(
int i = 0; i < local_recursion_depth; ++i) {
2147 if(part_no_array(i) > track_max) {
2148 track_max = this->part_no_array(i);
2152 this->last_dim_num_part = this->total_num_part /
2153 this->part_no_array(local_recursion_depth-1);
2155 this->max_num_part_along_dim = track_max;
2156 this->num_global_parts = this->total_num_part;
2158 mj_part_t future_num_parts = this->num_global_parts;
2162 if (this->first_level_distribution.size() != 0 &&
2163 this->num_first_level_parts > 1) {
2164 this->max_num_part_along_dim = this->num_first_level_parts;
2169 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
2170 mj_part_t maxNoPartAlongI = 0;
2171 mj_part_t nfutureNumParts = 0;
2177 this->first_level_distribution.size() != 0 &&
2178 this->num_first_level_parts > 1) {
2180 maxNoPartAlongI = this->num_first_level_parts;
2181 this->max_num_part_along_dim = this->num_first_level_parts;
2183 mj_part_t sum_first_level_dist = 0;
2184 mj_part_t max_part = 0;
2187 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2188 sum_first_level_dist += this->first_level_distribution(i);
2189 if (this->first_level_distribution(i) > max_part)
2190 max_part = this->first_level_distribution(i);
2196 this->num_global_parts * max_part / sum_first_level_dist;
2200 maxNoPartAlongI = this->get_part_count(future_num_parts,
2201 1.0f / (this->recursion_depth - rd));
2202 if (maxNoPartAlongI > this->max_num_part_along_dim)
2203 this->max_num_part_along_dim = maxNoPartAlongI;
2204 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205 if (future_num_parts % maxNoPartAlongI) {
2209 future_num_parts = nfutureNumParts;
2211 this->total_num_part = this->num_global_parts;
2213 if(this->divide_to_prime_first) {
2214 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215 this->last_dim_num_part = this->num_global_parts;
2222 for(
int i = 0; i < this->recursion_depth; ++i) {
2223 this->total_dim_num_reduce_all += p;
2224 p *= this->max_num_part_along_dim;
2227 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228 this->last_dim_num_part = this->num_global_parts;
2231 this->last_dim_num_part = p / this->max_num_part_along_dim;
2236 this->total_num_cut = this->total_num_part - 1;
2237 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239 size_t(this->max_num_cut_along_dim);
2244 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245 if(this->mj_problemComm->getRank() == 0) {
2246 std::cerr <<
"Warning: Concurrent part count (" <<
2247 this->max_concurrent_part_calculation <<
2248 ") has been set bigger than maximum amount that can be used." <<
2249 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2251 this->max_concurrent_part_calculation = this->last_dim_num_part;
2260template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2261 typename mj_part_t,
typename mj_node_t>
2262inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263 get_part_count(mj_part_t num_total_future,
double root)
2265 double fp = pow(num_total_future, root);
2266 mj_part_t ip = mj_part_t(fp);
2267 if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2294template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2295 typename mj_part_t,
typename mj_node_t>
2296mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297 update_part_num_arrays(
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2318 if(current_part_no_array < 1) {
2319 std::cout <<
"Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration <<
"] is given as:" <<
2321 current_part_no_array << std::endl;
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout <<
"Current recursive iteration: " << current_iteration
2333 <<
" part_no_array[" << current_iteration <<
"] is given as: " <<
2334 current_part_no_array <<
" and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2384 future_num_parts = 1;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr <<
"ERROR: maxPartNo calculation is wrong."
2399 " num_partitions_in_current_dim: "
2400 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for "
2406 "largest_prime_first partitioning." <<
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2426 output_num_parts = this->num_first_level_parts;
2429 future_num_parts /= this->num_first_level_parts;
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2436 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2448 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2453 else if (this->divide_to_prime_first) {
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2460 output_num_parts += num_partitions_in_current_dim;
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2504 ++num_future_parts_for_part_iii;
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2525 output_num_parts += num_partitions_in_current_dim;
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2543 ++num_future_parts_for_part_iii;
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*, device_t>(
"test", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(
size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2580template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2581 typename mj_part_t,
typename mj_node_t>
2582void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583 allocate_set_work_memory()
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing(
"num_local_coords"),
2601 this->num_local_coords);
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing(
"assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing(
"assigned part ids"),
2608 this->num_local_coords);
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing(
"part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing(
"all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing(
"process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing(
"global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing(
"cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing(
"target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing(
"global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing(
"is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing(
"device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing(
"thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing(
"thread_point_counts"),
2748 this->max_num_part_along_dim);
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_local_coords);
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2791template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2792 typename mj_part_t,
typename mj_node_t>
2793void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2797 mj_scalar_t *mins =
new mj_scalar_t[this->coord_dim];
2799 mj_scalar_t *gmins =
new mj_scalar_t[this->coord_dim];
2801 mj_scalar_t *maxs =
new mj_scalar_t[this->coord_dim];
2803 mj_scalar_t *gmaxs =
new mj_scalar_t[this->coord_dim];
2805 auto local_mj_coordinates = this->mj_coordinates;
2809 for(
int i = 0; i < this->coord_dim; ++i) {
2814 for(
int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce(
"MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce(
"MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2842 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2856template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2857 typename mj_part_t,
typename mj_node_t>
2858void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2870template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2873void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874 mj_get_local_min_max_coord_totW(
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2884 bool bUniformWeights = mj_uniform_weights(0);
2886 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2899 if(coordinate_begin_index >= coordinate_end_index)
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2907 Kokkos::parallel_reduce(
"get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2916 Kokkos::parallel_reduce(
"get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce(
"get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (
int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2965template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2966 typename mj_part_t,
typename mj_node_t>
2967void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2975 if(this->comm->getSize() > 1) {
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2987 reduceAll<int, mj_scalar_t>(
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3039template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3040 typename mj_part_t,
typename mj_node_t>
3041void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042 mj_get_initial_cut_coords_target_weights(
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts ,
3046 mj_scalar_t global_weight,
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts,
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3058 mj_scalar_t coord_range = max_coord - min_coord;
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3081 mj_scalar_t cumulative = 0;
3083 if(bUniformPartsCheck) {
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t
>((*future_num_part_in_parts)[concurrent_current_part]);
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight *
static_cast<mj_scalar_t
>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (
int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3113 Kokkos::parallel_for(
"Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3121 current_target_part_weights(num_cuts) = global_weight;
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3154template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3155 typename mj_part_t,
typename mj_node_t>
3156void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157 set_initial_coordinate_parts(
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1;
3193 mj_part_ids[iii] = 2 * pp;
3212template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3213 typename mj_part_t,
typename mj_node_t>
3214void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3224 this->temp_cut_coords = current_cut_coordinates;
3227 *reductionOp = NULL;
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(
size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3237 <mj_part_t, mj_scalar_t>(
3240 current_concurrent_num_parts);
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (
int dummy) {
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) =
false;
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3317 mj_current_dim_coords,
3321 this->mj_combine_rightleft_and_weights(
3323 current_concurrent_num_parts);
3326 if(!bSingleProcess) {
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce(
"Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(
int dummy,
size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3360 mj_part_t cut_shift = 0;
3364 size_t tlr_shift = 0;
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count(
"save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3393 this->total_part_weight_left_right_closests.size()));
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3431 local_process_cut_line_weight_to_put_left.size()));
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3456 local_cut_lower_bound_coordinates.size()));
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(
int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(
int dummy) {
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3536template<
class scalar_t>
3542 KOKKOS_INLINE_FUNCTION
3545 KOKKOS_INLINE_FUNCTION
3554#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3556template<
class policy_t,
class scalar_t,
class part_t>
3567 scalar_t mj_max_scalar,
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3577 KOKKOS_INLINE_FUNCTION
3582 KOKKOS_INLINE_FUNCTION
3585 dst.
ptr[n] += src.
ptr[n];
3590 if(src.
ptr[n] > dst.
ptr[n]) {
3593 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3594 dst.
ptr[n+1] = src.
ptr[n+1];
3599 KOKKOS_INLINE_FUNCTION
3602 dst.
ptr[n] += src.
ptr[n];
3607 if(src.
ptr[n] > dst.
ptr[n]) {
3610 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3611 dst.
ptr[n+1] = src.
ptr[n+1];
3632template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
3633 class device_t,
class array_t>
3638#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3661#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3683 scalar_t mj_sEpsilon
3684#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3707#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3716#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717 int result =
sizeof(array_t) *
3720 int result =
sizeof(array_t) *
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3732 KOKKOS_INLINE_FUNCTION
3733#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3760#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3778 teamMember.team_barrier();
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3811 index_t part =
parts(i)/2;
3822#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3823 Kokkos::atomic_add(&shared_ptr[part*2], w);
3825 threadSum.
ptr[part*2] += w;
3831#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3832 array_t new_value = (array_t) coord;
3834 while(new_value < prev_value) {
3835 prev_value = Kokkos::atomic_compare_exchange(
3837 prev_value, new_value);
3840 while(new_value > prev_value) {
3841 prev_value = Kokkos::atomic_compare_exchange(
3843 prev_value, new_value);
3861 if(coord < b + sEpsilon && coord > b -
sEpsilon) {
3865#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3866 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3870 threadSum.
ptr[part*2+1] += w;
3875 parts(i) = part*2+1;
3886 scalar_t delta = b - base_coord;
3887 if(delta < 0) delta = -delta;
3892#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3893 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3897 threadSum.
ptr[part*2+1] += w;
3908 scalar_t delta = b - base_coord;
3909 if(delta < 0) delta = -delta;
3914#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3915 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3919 threadSum.
ptr[part*2+1] += w;
3944 if(part == lower + 1) {
3949 part -= (part - lower)/2;
3952 else if(part == upper - 1) {
3957 part += (upper - part)/2;
3961#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3964 }, arraySumReducer);
3967 teamMember.team_barrier();
3970 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3972#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3973 Kokkos::atomic_add(¤t_part_weights(n),
3974 static_cast<double>(shared_ptr[n]));
3976 teamSum[n] += array.ptr[n];
3980#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981 int insert_left = 0;
3982 int insert_right = 0;
3987#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3988 scalar_t new_value = shared_ptr[n+1];
3989 scalar_t prev_value = current_right_closest(insert_right);
3990 while(new_value < prev_value) {
3991 prev_value = Kokkos::atomic_compare_exchange(
3992 ¤t_right_closest(insert_right), prev_value, new_value);
3995 new_value = shared_ptr[n];
3996 prev_value = current_left_closest(insert_left);
3997 while(new_value > prev_value) {
3998 prev_value = Kokkos::atomic_compare_exchange(
3999 ¤t_left_closest(insert_left), prev_value, new_value);
4005 if(array.ptr[n] > teamSum[n]) {
4006 teamSum[n] = array.ptr[n];
4008 if(array.ptr[n+1] < teamSum[n+1]) {
4009 teamSum[n+1] = array.ptr[n+1];
4015 teamMember.team_barrier();
4018#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4019 KOKKOS_INLINE_FUNCTION
4027 if(src[n] > dst[n]) {
4030 if(src[n+1] < dst[n+1]) {
4031 dst[n+1] = src[n+1];
4057template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4058 typename mj_part_t,
typename mj_node_t>
4059void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4060 mj_1D_part_get_part_weights(
4063 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4066 auto local_is_cut_line_determined = is_cut_line_determined;
4067 auto local_thread_part_weights = thread_part_weights;
4068 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4069 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4073 auto local_sEpsilon = this->
sEpsilon;
4074 auto local_assigned_part_ids = this->assigned_part_ids;
4075 auto local_coordinate_permutations = this->coordinate_permutations;
4076 auto local_mj_weights = this->mj_weights;
4078 auto local_global_min_max_coord_total_weight =
4079 this->global_min_max_coord_total_weight;
4081 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4083 auto local_device_num_partitioning_in_current_dim =
4084 device_num_partitioning_in_current_dim;
4086 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4087 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4089 mj_part_t total_part_shift = 0;
4091 mj_part_t concurrent_cut_shifts = 0;
4093 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4094 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4095 concurrent_cut_shifts, temp_cut_coords.size()));
4097 mj_part_t num_parts =
4099 mj_part_t
num_cuts = num_parts - 1;
4100 mj_part_t total_part_count = num_parts +
num_cuts;
4101 mj_part_t weight_array_length =
num_cuts + num_parts;
4104 mj_part_t right_left_array_length = (
num_cuts + 2) * 2;
4106 if(this->incomplete_cut_count(kk) == 0) {
4107 total_part_shift += total_part_count;
4113 auto policy_ReduceWeightsFunctor = policy_t(
4114 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4116#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4117 int total_array_length =
4118 weight_array_length + right_left_array_length;
4125 typedef mj_scalar_t array_t;
4127#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4128 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", total_array_length);
4131 int offset_cuts = 0;
4132 for(
int kk2 = 0; kk2 < kk; ++kk2) {
4136 Kokkos::View<double *, device_t> my_current_part_weights =
4137 Kokkos::subview(local_thread_part_weights,
4138 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4139 total_part_shift + total_part_count));
4140 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4141 Kokkos::subview(local_thread_cut_left_closest_point,
4142 std::pair<mj_lno_t, mj_lno_t>(
4144 local_thread_cut_left_closest_point.size()));
4145 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4146 Kokkos::subview(local_thread_cut_right_closest_point,
4147 std::pair<mj_lno_t, mj_lno_t>(
4149 local_thread_cut_right_closest_point.size()));
4151 array_t
max_scalar = std::numeric_limits<array_t>::max();
4153#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4155 Kokkos::parallel_for(
4156 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4157 KOKKOS_LAMBDA (
int dummy) {
4158 for(
int n = 0; n < weight_array_length; ++n) {
4159 my_current_part_weights(n) = 0;
4161 for(
int n = 0; n <
num_cuts; ++n) {
4172 typename mj_node_t::device_type, array_t>
4180 right_left_array_length,
4181 weight_array_length,
4182 coordinate_permutations,
4183 mj_current_dim_coords,
4186 local_temp_cut_coords,
4188 mj_uniform_weights(0),
4190#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4191 ,my_current_part_weights,
4192 my_current_left_closest,
4193 my_current_right_closest
4197#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4198 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4200 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4201 teamFunctor, reduce_array);
4205#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4206 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4208 for(
int i = 0; i < static_cast<int>(total_part_count); ++i) {
4209 hostArray(i) = reduce_array[i];
4212 Kokkos::deep_copy(my_current_part_weights, hostArray);
4214 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4215 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4216 for(mj_part_t cut = 0; cut <
num_cuts; ++cut) {
4217 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4218 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4220 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4221 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4224 total_part_shift += total_part_count;
4228 auto local_temp_cut_coords = temp_cut_coords;
4230 Kokkos::parallel_for(
4231 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4233 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4235 mj_part_t
num_cuts = num_parts - 1;
4236 mj_part_t total_part_count = num_parts +
num_cuts;
4238 if(local_device_incomplete_cut_count(kk) > 0) {
4242 size_t offset_cuts = 0;
4243 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4244 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4246 offset += num_parts_kk2 * 2 - 1;
4247 offset_cuts += num_parts_kk2 - 1;
4250 for(mj_part_t i = 1; i < total_part_count; ++i) {
4254 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4255 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4256 local_temp_cut_coords(offset_cuts + i /2 - 1))
4261 local_thread_part_weights(offset + i)
4262 = local_thread_part_weights(offset + i-2);
4267 local_thread_part_weights(offset + i) +=
4268 local_thread_part_weights(offset + i-1);
4281template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4282 typename mj_part_t,
typename mj_node_t>
4283void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4284 mj_combine_rightleft_and_weights(
4288 auto local_thread_part_weights = this->thread_part_weights;
4289 auto local_is_cut_line_determined = this->is_cut_line_determined;
4290 auto local_thread_cut_left_closest_point =
4291 this->thread_cut_left_closest_point;
4292 auto local_thread_cut_right_closest_point =
4293 this->thread_cut_right_closest_point;
4294 auto local_total_part_weight_left_right_closests =
4295 this->total_part_weight_left_right_closests;
4296 auto local_device_num_partitioning_in_current_dim =
4297 device_num_partitioning_in_current_dim;
4298 Kokkos::parallel_for(
4299 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4300 KOKKOS_LAMBDA (
int dummy) {
4302 size_t tlr_array_shift = 0;
4303 mj_part_t cut_shift = 0;
4304 size_t total_part_array_shift = 0;
4310 mj_part_t num_parts_in_part =
4312 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4313 size_t num_total_part_in_part =
4314 num_parts_in_part + size_t (num_cuts_in_part);
4317 for(
int ii = 0; ii < num_cuts_in_part; ++ii) {
4318 mj_part_t next = tlr_array_shift + ii;
4319 mj_part_t cut_index = cut_shift + ii;
4321 if(!local_is_cut_line_determined(cut_index)) {
4322 mj_scalar_t left_closest_in_process =
4323 local_thread_cut_left_closest_point(cut_index);
4324 mj_scalar_t right_closest_in_process =
4325 local_thread_cut_right_closest_point(cut_index);
4328 local_total_part_weight_left_right_closests(
4329 num_total_part_in_part + next) = left_closest_in_process;
4331 local_total_part_weight_left_right_closests(
4332 num_total_part_in_part + num_cuts_in_part + next) =
4333 right_closest_in_process;
4337 for(
size_t j = 0; j < num_total_part_in_part; ++j) {
4338 mj_part_t cut_ind = j / 2 + cut_shift;
4344 if(j == num_total_part_in_part - 1 ||
4345 !local_is_cut_line_determined(cut_ind)) {
4346 double pwj = local_thread_part_weights(total_part_array_shift + j);
4347 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4352 cut_shift += num_cuts_in_part;
4353 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4354 total_part_array_shift += num_total_part_in_part;
4371template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4372 typename mj_part_t,
typename mj_node_t>
4373KOKKOS_INLINE_FUNCTION
4374void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4375 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4376 mj_scalar_t cut_lower_bound,
4377 mj_scalar_t cut_upper_weight,
4378 mj_scalar_t cut_lower_weight,
4379 mj_scalar_t expected_weight,
4380 mj_scalar_t &new_cut_position,
4383 if(std::abs(cut_upper_bound - cut_lower_bound) <
sEpsilon) {
4384 new_cut_position = cut_upper_bound;
4387 if(std::abs(cut_upper_weight - cut_lower_weight) <
sEpsilon) {
4388 new_cut_position = cut_lower_bound;
4391 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4392 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4393 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4395 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4396 int scale_constant = 20;
4397 int shiftint= int (required_shift * scale_constant);
4398 if(shiftint == 0) shiftint = 1;
4399 required_shift = mj_scalar_t (shiftint) / scale_constant;
4400 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4403#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4405template<
class policy_t,
class scalar_t>
4415 int mj_value_count) :
4417 value_count(mj_value_count)
4420 KOKKOS_INLINE_FUNCTION
4425 KOKKOS_INLINE_FUNCTION
4427 for(
int n = 0; n < value_count; ++n) {
4428 dst.
ptr[n] += src.
ptr[n];
4434 for(
int n = 0; n < value_count; ++n) {
4442template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
4443 class device_t,
class array_t>
4448#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4449 typedef array_t value_type[];
4460#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4461 Kokkos::View<int *, device_t> local_point_counts;
4465 part_t mj_concurrent_current_part,
4466 part_t mj_weight_array_size,
4467 Kokkos::View<index_t*, device_t> & mj_permutations,
4468 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4469 Kokkos::View<part_t*, device_t> & mj_parts,
4470 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4471 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4472#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4473 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4476 concurrent_current_part(mj_concurrent_current_part),
4477 value_count(mj_weight_array_size),
4478 permutations(mj_permutations),
4481 part_xadj(mj_part_xadj),
4482 track_on_cuts(mj_track_on_cuts)
4483#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4484 ,local_point_counts(mj_local_point_counts)
4490#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4491 int result =
sizeof(array_t) * (value_count);
4493 int result =
sizeof(array_t) * (value_count) * team_size;
4497 int remainder = result % 8;
4498 if(remainder != 0) {
4499 result += 8 - remainder;
4504 KOKKOS_INLINE_FUNCTION
4505#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4506 void operator() (
const member_type & teamMember)
const {
4508 void operator() (
const member_type & teamMember, value_type teamSum)
const {
4510 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4511 part_xadj(concurrent_current_part - 1);
4512 index_t all_end = part_xadj(concurrent_current_part);
4514 index_t num_working_points = all_end - all_begin;
4515 int num_teams = teamMember.league_size();
4517 index_t stride = num_working_points / num_teams;
4518 if((num_working_points % num_teams) > 0) {
4522 index_t begin = all_begin + stride * teamMember.league_rank();
4523 index_t end = begin + stride;
4528 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4531#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4532 size_t sh_mem_size =
sizeof(array_t) * (value_count);
4534 size_t sh_mem_size =
4535 sizeof(array_t) * (value_count) * teamMember.team_size();
4538 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4541#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4543 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4544 for(
int n = 0; n < value_count; ++n) {
4548 teamMember.team_barrier();
4550 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4560 Kokkos::parallel_reduce(
4561 Kokkos::TeamThreadRange(teamMember, begin, end),
4565 index_t coordinate_index = permutations(ii);
4566 part_t place = parts(coordinate_index);
4568 if(place % 2 == 0) {
4569#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4570 Kokkos::atomic_add(&shared_ptr[part], 1);
4572 threadSum.
ptr[part] += 1;
4575 parts(coordinate_index) = part;
4580 index_t set_index = Kokkos::atomic_fetch_add(
4581 &track_on_cuts(track_on_cuts_insert_index), 1);
4582 track_on_cuts(set_index) = ii;
4584#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4590 teamMember.team_barrier();
4593 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4594 for(
int n = 0; n < value_count; ++n) {
4595#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4596 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4598 teamSum[n] += array.ptr[n];
4603 teamMember.team_barrier();
4606#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4608 KOKKOS_INLINE_FUNCTION
4609 void join(value_type dst,
const value_type src)
const {
4610 for(
int n = 0; n < value_count; ++n) {
4615 KOKKOS_INLINE_FUNCTION
void init (value_type dst)
const {
4616 for(
int n = 0; n < value_count; ++n) {
4638template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4639 typename mj_part_t,
typename mj_node_t>
4640void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4641mj_create_new_partitions(
4642 mj_part_t num_parts,
4643 mj_part_t current_concurrent_work_part,
4644 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4645 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4646 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4647 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4650 auto local_thread_part_weight_work = this->thread_part_weight_work;
4651 auto local_point_counts = this->thread_point_counts;
4652 auto local_distribute_points_on_cut_lines =
4653 this->distribute_points_on_cut_lines;
4654 auto local_thread_cut_line_weight_to_put_left =
4655 this->thread_cut_line_weight_to_put_left;
4656 auto local_sEpsilon = this->sEpsilon;
4657 auto local_coordinate_permutations = this->coordinate_permutations;
4658 auto local_mj_weights = this->mj_weights;
4659 auto local_assigned_part_ids = this->assigned_part_ids;
4660 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4662 mj_part_t num_cuts = num_parts - 1;
4664 Kokkos::parallel_for(
4665 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4666 KOKKOS_LAMBDA(
int dummy) {
4668 if(local_distribute_points_on_cut_lines) {
4669 for(
int i = 0; i < num_cuts; ++i) {
4670 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4671 if(left_weight > local_sEpsilon) {
4673 mj_scalar_t thread_ii_weight_on_cut =
4674 local_thread_part_weight_work(i * 2 + 1) -
4675 local_thread_part_weight_work(i * 2);
4677 if(thread_ii_weight_on_cut < left_weight) {
4679 local_thread_cut_line_weight_to_put_left(i) =
4680 thread_ii_weight_on_cut;
4684 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4686 left_weight -= thread_ii_weight_on_cut;
4689 local_thread_cut_line_weight_to_put_left(i) = 0;
4695 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4696 if(std::abs(current_concurrent_cut_coordinate(i) -
4697 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4698 local_thread_cut_line_weight_to_put_left(i) -=
4699 local_thread_cut_line_weight_to_put_left(i - 1);
4701 local_thread_cut_line_weight_to_put_left(i) =
4702 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4703 least_signifiance) * significance_mul) /
4704 static_cast<mj_scalar_t
>(significance_mul);
4708 for(mj_part_t i = 0; i < num_parts; ++i) {
4709 local_point_counts(i) = 0;
4713 mj_lno_t coordinate_begin_index =
4714 current_concurrent_work_part == 0 ? 0 :
4715 host_part_xadj(current_concurrent_work_part - 1);
4716 mj_lno_t coordinate_end_index =
4717 host_part_xadj(current_concurrent_work_part);
4719 mj_lno_t total_on_cut;
4720 Kokkos::parallel_reduce(
"Get total_on_cut",
4721 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4722 coordinate_begin_index, coordinate_end_index),
4723 KOKKOS_LAMBDA(
int ii, mj_lno_t & val) {
4724 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4725 mj_part_t coordinate_assigned_place =
4726 local_assigned_part_ids(coordinate_index);
4727 if(coordinate_assigned_place % 2 == 1) {
4732 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4733 if(total_on_cut > 0) {
4734 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4745 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4748 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4750 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4751 typedef int array_t;
4755#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4756 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", num_parts);
4759 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4760 typename mj_node_t::device_type, array_t>teamFunctor(
4761 current_concurrent_work_part,
4763 coordinate_permutations,
4764 mj_current_dim_coords,
4768#
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4773#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4774 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4776 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4780#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4781 for(mj_part_t part = 0; part < num_parts; ++part) {
4782 local_point_counts(part) = reduce_array[part];
4788 if(track_on_cuts.size() > 0) {
4789 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4790 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1));
4791 Kokkos::sort(track_on_cuts_sort);
4794 bool uniform_weights0 = this->mj_uniform_weights(0);
4795 Kokkos::parallel_for(
4796 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4797 KOKKOS_LAMBDA (
int dummy) {
4799 for(
int j = 0; j < total_on_cut; ++j) {
4800 int ii = track_on_cuts(j);
4801 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4802 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4803 local_mj_weights(coordinate_index,0);
4804 mj_part_t coordinate_assigned_place =
4805 local_assigned_part_ids(coordinate_index);
4806 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4808 if(local_distribute_points_on_cut_lines &&
4809 local_thread_cut_line_weight_to_put_left(
4810 coordinate_assigned_part) > local_sEpsilon) {
4814 local_thread_cut_line_weight_to_put_left(
4815 coordinate_assigned_part) -= coordinate_weight;
4820 if(local_thread_cut_line_weight_to_put_left(
4821 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4823 std::abs(current_concurrent_cut_coordinate(
4824 coordinate_assigned_part+1) -
4825 current_concurrent_cut_coordinate(
4826 coordinate_assigned_part)) < local_sEpsilon)
4828 local_thread_cut_line_weight_to_put_left(
4829 coordinate_assigned_part + 1) +=
4830 local_thread_cut_line_weight_to_put_left(
4831 coordinate_assigned_part);
4833 ++local_point_counts(coordinate_assigned_part);
4834 local_assigned_part_ids(coordinate_index) =
4835 coordinate_assigned_part;
4840 ++coordinate_assigned_part;
4843 while(local_distribute_points_on_cut_lines &&
4844 coordinate_assigned_part < num_cuts)
4847 if(std::abs(current_concurrent_cut_coordinate(
4848 coordinate_assigned_part) -
4849 current_concurrent_cut_coordinate(
4850 coordinate_assigned_part - 1)) < local_sEpsilon)
4853 if(local_thread_cut_line_weight_to_put_left(
4854 coordinate_assigned_part) > local_sEpsilon &&
4855 local_thread_cut_line_weight_to_put_left(
4856 coordinate_assigned_part) >=
4857 std::abs(local_thread_cut_line_weight_to_put_left(
4858 coordinate_assigned_part) - coordinate_weight))
4860 local_thread_cut_line_weight_to_put_left(
4861 coordinate_assigned_part) -= coordinate_weight;
4865 if(local_thread_cut_line_weight_to_put_left(
4866 coordinate_assigned_part) < 0 &&
4867 coordinate_assigned_part < num_cuts - 1 &&
4868 std::abs(current_concurrent_cut_coordinate(
4869 coordinate_assigned_part+1) -
4870 current_concurrent_cut_coordinate(
4871 coordinate_assigned_part)) < local_sEpsilon)
4873 local_thread_cut_line_weight_to_put_left(
4874 coordinate_assigned_part + 1) +=
4875 local_thread_cut_line_weight_to_put_left(
4876 coordinate_assigned_part);
4884 ++coordinate_assigned_part;
4886 local_point_counts(coordinate_assigned_part) += 1;
4887 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4891 for(
int j = 0; j < num_parts; ++j) {
4892 out_part_xadj(j) = local_point_counts(j);
4893 local_point_counts(j) = 0;
4896 out_part_xadj(j) += out_part_xadj(j - 1);
4897 local_point_counts(j) += out_part_xadj(j - 1);
4905#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4910 Kokkos::parallel_for(
4911 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4912 coordinate_begin_index, coordinate_end_index),
4913 KOKKOS_LAMBDA (mj_lno_t ii) {
4914 mj_lno_t i = local_coordinate_permutations(ii);
4915 mj_part_t p = local_assigned_part_ids(i);
4916 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4917 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4922#ifdef KOKKOS_ENABLE_OPENMP
4924 const int num_threads = 1;
4926 const int num_threads = 1;
4929 const int num_teams = 1;
4932 Kokkos::View<mj_lno_t*, device_t>
4933 point_counter(
"insert indices", num_teams * num_threads * num_parts);
4937 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4938 block_policy(num_teams, num_threads);
4939 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4940 member_type member_type;
4941 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4942 mj_lno_t block_size = range / num_teams + 1;
4943 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4944 int team = team_member.league_rank();
4945 int team_offset = team * num_threads * num_parts;
4946 mj_lno_t begin = coordinate_begin_index + team * block_size;
4947 mj_lno_t end = begin + block_size;
4948 if(end > coordinate_end_index) {
4949 end = coordinate_end_index;
4952 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4954 int thread = team_member.team_rank();
4955 mj_lno_t i = local_coordinate_permutations(ii);
4956 mj_part_t p = local_assigned_part_ids(i);
4957 int index = team_offset + thread * num_parts + p;
4958 ++point_counter(index);
4966 Kokkos::parallel_for(
4967 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4968 KOKKOS_LAMBDA (
int dummy) {
4969 int num_sets = point_counter.size() / num_parts;
4970 for(
int set = num_sets - 1; set >= 1; set -=1) {
4971 int base = set * num_parts;
4972 for(
int part = 0; part < num_parts; ++part) {
4973 point_counter(base + part) = point_counter(base + part - num_parts);
4977 for(
int part = 0; part < num_parts; ++part) {
4978 point_counter(part) = 0;
4981 for(
int set = 1; set < num_sets; ++set) {
4982 int base = set * num_parts;
4983 for(
int part = 0; part < num_parts; ++part) {
4984 point_counter(base + part) += point_counter(base + part - num_parts);
4990 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4991 int team = team_member.league_rank();
4992 int team_offset = team * num_threads * num_parts;
4993 mj_lno_t begin = coordinate_begin_index + team * block_size;
4994 mj_lno_t end = begin + block_size;
4995 if(end > coordinate_end_index) {
4996 end = coordinate_end_index;
4998 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5000 int thread = team_member.team_rank();
5001 mj_lno_t i = local_coordinate_permutations(ii);
5002 mj_part_t p = local_assigned_part_ids(i);
5003 int index = team_offset + thread * num_parts + p;
5004 int set_counter = (point_counter(index)++) + local_point_counts(p);
5005 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5054template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5055 typename mj_part_t,
typename mj_node_t>
5056void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5057 mj_node_t>::mj_get_new_cut_coordinates(
5058 mj_part_t current_concurrent_num_parts,
5060 const mj_part_t &num_cuts,
5061 const double &used_imbalance_tolerance,
5062 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5063 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5064 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5065 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5066 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5067 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5068 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5069 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5070 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5071 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5072 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5073 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5074 Kokkos::View<mj_scalar_t *, device_t> &
5075 current_part_cut_line_weight_to_put_left,
5076 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5078 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5080 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5081 auto local_sEpsilon = sEpsilon;
5082 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5083 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5084 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5085 auto local_global_min_max_coord_total_weight =
5086 global_min_max_coord_total_weight;
5088 const auto _sEpsilon = this->sEpsilon;
5096 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5097 policy_one_team(1, Kokkos::AUTO());
5098 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5099 member_type member_type;
5100 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5102 mj_scalar_t min_coordinate =
5103 local_global_min_max_coord_total_weight(kk);
5104 mj_scalar_t max_coordinate =
5105 local_global_min_max_coord_total_weight(
5106 kk + current_concurrent_num_parts);
5107 mj_scalar_t global_total_weight =
5108 local_global_min_max_coord_total_weight(
5109 kk + current_concurrent_num_parts * 2);
5111 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5116 current_global_left_closest_points(i) > local_sEpsilon) {
5117 current_global_left_closest_points(i) =
5118 current_cut_coordinates(i);
5120 if(current_global_right_closest_points(i) -
5121 max_coordinate > local_sEpsilon) {
5122 current_global_right_closest_points(i) =
5123 current_cut_coordinates(i);
5126 team_member.team_barrier();
5128 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5130 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5133 mj_scalar_t seen_weight_in_part = 0;
5135 mj_scalar_t expected_weight_in_part = 0;
5137 double imbalance_on_left = 0, imbalance_on_right = 0;
5138 if(local_distribute_points_on_cut_lines) {
5140 local_global_rectilinear_cut_weight(i) = 0;
5141 local_process_rectilinear_cut_weight(i) = 0;
5143 bool bContinue =
false;
5146 if(current_cut_line_determined(i)) {
5147 new_current_cut_coordinates(i) =
5148 current_cut_coordinates(i);
5153 seen_weight_in_part = current_global_part_weights(i * 2);
5156 expected_weight_in_part = current_part_target_weights(i);
5159 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5160 expected_weight_in_part);
5163 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5164 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5165 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5166 used_imbalance_tolerance < local_sEpsilon ;
5167 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5168 used_imbalance_tolerance < local_sEpsilon;
5170 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5171 current_cut_line_determined(i) =
true;
5172 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5173 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5175 else if(imbalance_on_left < 0) {
5177 if(local_distribute_points_on_cut_lines) {
5182 if(current_global_part_weights(i * 2 + 1) ==
5183 expected_weight_in_part) {
5185 current_cut_line_determined(i) =
true;
5186 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5189 new_current_cut_coordinates(i) =
5190 current_cut_coordinates(i);
5192 current_part_cut_line_weight_to_put_left(i) =
5193 current_local_part_weights(i * 2 + 1) -
5194 current_local_part_weights(i * 2);
5197 else if(current_global_part_weights(i * 2 + 1) >
5198 expected_weight_in_part) {
5201 current_cut_line_determined(i) =
true;
5202 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5206 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5207 new_current_cut_coordinates(i) =
5208 current_cut_coordinates(i);
5209 local_process_rectilinear_cut_weight[i] =
5210 current_local_part_weights(i * 2 + 1) -
5211 current_local_part_weights(i * 2);
5220 current_cut_lower_bounds(i) =
5221 current_global_right_closest_points(i);
5224 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5229 for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5230 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5231 mj_scalar_t line_weight =
5232 current_global_part_weights(ii * 2 + 1);
5233 if(p_weight >= expected_weight_in_part) {
5238 if(p_weight == expected_weight_in_part) {
5239 current_cut_upper_bounds(i) =
5240 current_cut_coordinates(ii);
5241 current_cut_upper_weights(i) = p_weight;
5242 current_cut_lower_bounds(i) =
5243 current_cut_coordinates(ii);
5244 current_cut_lower_bound_weights(i) = p_weight;
5245 }
else if(p_weight < current_cut_upper_weights(i)) {
5248 current_cut_upper_bounds(i) =
5249 current_global_left_closest_points(ii);
5250 current_cut_upper_weights(i) = p_weight;
5256 if(line_weight >= expected_weight_in_part) {
5260 current_cut_upper_bounds(i) =
5261 current_cut_coordinates(ii);
5262 current_cut_upper_weights(i) = line_weight;
5263 current_cut_lower_bounds(i) =
5264 current_cut_coordinates(ii);
5265 current_cut_lower_bound_weights(i) = p_weight;
5270 if(p_weight <= expected_weight_in_part && p_weight >=
5271 current_cut_lower_bound_weights(i)) {
5272 current_cut_lower_bounds(i) =
5273 current_global_right_closest_points(ii);
5274 current_cut_lower_bound_weights(i) = p_weight;
5278 mj_scalar_t new_cut_position = 0;
5279 algMJ_t::mj_calculate_new_cut_position(
5280 current_cut_upper_bounds(i),
5281 current_cut_lower_bounds(i),
5282 current_cut_upper_weights(i),
5283 current_cut_lower_bound_weights(i),
5284 expected_weight_in_part, new_cut_position,
5289 if(std::abs(current_cut_coordinates(i) -
5290 new_cut_position) < local_sEpsilon) {
5291 current_cut_line_determined(i) =
true;
5292 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5295 new_current_cut_coordinates(i) =
5296 current_cut_coordinates(i);
5298 new_current_cut_coordinates(i) = new_cut_position;
5304 current_cut_upper_bounds(i) =
5305 current_global_left_closest_points(i);
5306 current_cut_upper_weights(i) =
5307 seen_weight_in_part;
5310 for(
int ii = i - 1; ii >= 0; --ii) {
5311 mj_scalar_t p_weight =
5312 current_global_part_weights(ii * 2);
5313 mj_scalar_t line_weight =
5314 current_global_part_weights(ii * 2 + 1);
5315 if(p_weight <= expected_weight_in_part) {
5316 if(p_weight == expected_weight_in_part) {
5319 current_cut_upper_bounds(i) =
5320 current_cut_coordinates(ii);
5321 current_cut_upper_weights(i) = p_weight;
5322 current_cut_lower_bounds(i) =
5323 current_cut_coordinates(ii);
5324 current_cut_lower_bound_weights(i) = p_weight;
5326 else if(p_weight > current_cut_lower_bound_weights(i)) {
5329 current_cut_lower_bounds(i) =
5330 current_global_right_closest_points(ii);
5331 current_cut_lower_bound_weights(i) = p_weight;
5337 if(line_weight > expected_weight_in_part) {
5338 current_cut_upper_bounds(i) =
5339 current_global_right_closest_points(ii);
5340 current_cut_upper_weights(i) = line_weight;
5350 if(p_weight >= expected_weight_in_part &&
5351 (p_weight < current_cut_upper_weights(i) ||
5352 (p_weight == current_cut_upper_weights(i) &&
5353 current_cut_upper_bounds(i) >
5354 current_global_left_closest_points(ii)))) {
5355 current_cut_upper_bounds(i) =
5356 current_global_left_closest_points(ii);
5357 current_cut_upper_weights(i) = p_weight;
5360 mj_scalar_t new_cut_position = 0;
5361 algMJ_t::mj_calculate_new_cut_position(
5362 current_cut_upper_bounds(i),
5363 current_cut_lower_bounds(i),
5364 current_cut_upper_weights(i),
5365 current_cut_lower_bound_weights(i),
5366 expected_weight_in_part,
5371 if(std::abs(current_cut_coordinates(i) -
5372 new_cut_position) < local_sEpsilon) {
5373 current_cut_line_determined(i) =
true;
5374 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5376 new_current_cut_coordinates(i) =
5377 current_cut_coordinates(i);
5379 new_current_cut_coordinates(i) =
5386 team_member.team_barrier();
5390 mj_part_t rectilinear_cut_count;
5391 Kokkos::parallel_reduce(
"Read bDoingWork",
5392 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5393 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
5394 set_single = view_rectilinear_cut_count(0);
5395 }, rectilinear_cut_count);
5397 if(rectilinear_cut_count > 0) {
5398 auto host_local_process_rectilinear_cut_weight =
5399 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5400 local_process_rectilinear_cut_weight);
5401 auto host_local_global_rectilinear_cut_weight =
5402 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5403 local_global_rectilinear_cut_weight);
5404 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5405 local_process_rectilinear_cut_weight);
5406 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5407 local_global_rectilinear_cut_weight);
5408 Teuchos::scan<int,mj_scalar_t>(
5409 *comm, Teuchos::REDUCE_SUM,
5411 host_local_process_rectilinear_cut_weight.data(),
5412 host_local_global_rectilinear_cut_weight.data());
5413 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5414 host_local_process_rectilinear_cut_weight);
5415 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5416 host_local_global_rectilinear_cut_weight);
5418 Kokkos::parallel_for(
"finish up mj_get_new_cut_coordinates",
5419 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5420 KOKKOS_LAMBDA(
int dummy) {
5421 for(mj_part_t i = 0; i < num_cuts; ++i) {
5423 if(local_global_rectilinear_cut_weight(i) > 0) {
5425 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5427 mj_scalar_t necessary_weight_on_line_for_left =
5428 expected_part_weight - current_global_part_weights(i * 2);
5431 mj_scalar_t my_weight_on_line =
5432 local_process_rectilinear_cut_weight(i);
5436 mj_scalar_t weight_on_line_upto_process_inclusive =
5437 local_global_rectilinear_cut_weight(i);
5441 mj_scalar_t space_to_put_left =
5442 necessary_weight_on_line_for_left -
5443 weight_on_line_upto_process_inclusive;
5446 mj_scalar_t space_left_to_me =
5447 space_to_put_left + my_weight_on_line;
5460 if(space_left_to_me < 0) {
5463 current_part_cut_line_weight_to_put_left(i) = 0;
5465 else if(space_left_to_me >= my_weight_on_line) {
5469 current_part_cut_line_weight_to_put_left(i) =
5476 current_part_cut_line_weight_to_put_left(i) =
5483 view_rectilinear_cut_count(0) = 0;
5487 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5499template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5500 typename mj_part_t,
typename mj_node_t>
5501void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5502 get_processor_num_points_in_parts(
5503 mj_part_t num_procs,
5504 mj_part_t num_parts,
5505 mj_gno_t *&num_points_in_all_processor_parts)
5508 size_t allocation_size = num_parts * (num_procs + 1);
5515 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5516 new mj_gno_t[allocation_size];
5520 mj_gno_t *my_local_points_to_reduce_sum =
5521 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5525 mj_gno_t *my_local_point_counts_in_each_part =
5526 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5529 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5530 sizeof(mj_gno_t)*allocation_size);
5532 auto local_new_part_xadj = this->new_part_xadj;
5533 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5534 Kokkos::ViewAllocateWithoutInitializing(
"points per part"), num_parts);
5535 Kokkos::parallel_for(
"get vals on device",
5536 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5537 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5538 points_per_part(i) =
5539 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5541 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5542 Kokkos::deep_copy(host_points_per_part, points_per_part);
5543 for(
int i = 0; i < num_parts; ++i) {
5544 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5549 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5550 sizeof(mj_gno_t) * (num_parts) );
5557 reduceAll<int, mj_gno_t>(
5559 Teuchos::REDUCE_SUM,
5561 num_local_points_in_each_part_to_reduce_sum,
5562 num_points_in_all_processor_parts);
5566 delete [] num_local_points_in_each_part_to_reduce_sum;
5584template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5585 typename mj_part_t,
typename mj_node_t>
5586bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5587 mj_check_to_migrate(
5588 size_t migration_reduce_all_population,
5589 mj_lno_t num_coords_for_last_dim_part,
5590 mj_part_t num_procs,
5591 mj_part_t num_parts,
5592 mj_gno_t *num_points_in_all_processor_parts)
5595 if(migration_reduce_all_population > future_reduceall_cutoff) {
5600 if(num_coords_for_last_dim_part < min_work_last_dim) {
5605 if(this->check_migrate_avoid_migration_option == 0) {
5606 double global_imbalance = 0;
5608 size_t global_shift = num_procs * num_parts;
5610 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5611 for(mj_part_t i = 0; i < num_parts; ++i) {
5612 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5613 / double(num_procs);
5615 global_imbalance += std::abs(ideal_num -
5616 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5619 global_imbalance /= num_parts;
5620 global_imbalance /= num_procs;
5622 if(global_imbalance <= this->minimum_migration_imbalance) {
5648template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5649 typename mj_part_t,
typename mj_node_t>
5650void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5651 assign_send_destinations(
5652 mj_part_t num_parts,
5653 mj_part_t *part_assignment_proc_begin_indices,
5654 mj_part_t *processor_chains_in_parts,
5655 mj_lno_t *send_count_to_each_proc,
5656 int *coordinate_destinations) {
5658 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5659 deep_copy(host_new_part_xadj, this->new_part_xadj);
5661 auto host_new_coordinate_permutations =
5662 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5663 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5665 for(mj_part_t p = 0; p < num_parts; ++p) {
5666 mj_lno_t part_begin = 0;
5667 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5668 mj_lno_t part_end = host_new_part_xadj(p);
5670 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5672 mj_lno_t num_total_send = 0;
5673 for(mj_lno_t j=part_begin; j < part_end; j++) {
5674 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5675 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5679 part_assignment_proc_begin_indices[p] =
5680 processor_chains_in_parts[proc_to_sent];
5682 processor_chains_in_parts[proc_to_sent] = -1;
5684 proc_to_sent = part_assignment_proc_begin_indices[p];
5687 coordinate_destinations[local_ind] = proc_to_sent;
5713template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5714 typename mj_part_t,
typename mj_node_t>
5715void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5716 mj_assign_proc_to_parts(
5717 mj_gno_t * num_points_in_all_processor_parts,
5718 mj_part_t num_parts,
5719 mj_part_t num_procs,
5720 mj_lno_t *send_count_to_each_proc,
5721 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5722 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5723 mj_part_t &out_part_index,
5724 mj_part_t &output_part_numbering_begin_index,
5725 int * coordinate_destinations) {
5726 mj_gno_t *global_num_points_in_parts =
5727 num_points_in_all_processor_parts + num_procs * num_parts;
5728 mj_part_t *num_procs_assigned_to_each_part =
new mj_part_t[num_parts];
5731 bool did_i_find_my_group =
false;
5733 mj_part_t num_free_procs = num_procs;
5734 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5736 double max_imbalance_difference = 0;
5737 mj_part_t max_differing_part = 0;
5740 for(mj_part_t i = 0; i < num_parts; i++) {
5743 double scalar_required_proc = num_procs *
5744 (double (global_num_points_in_parts[i]) /
5745 double (this->num_global_coords));
5748 mj_part_t required_proc =
5749 static_cast<mj_part_t
> (0.5 + scalar_required_proc);
5750 if(required_proc == 0) required_proc = 1;
5756 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5757 required_proc = num_free_procs -
5758 (minimum_num_procs_required_for_rest_of_parts);
5762 num_free_procs -= required_proc;
5766 --minimum_num_procs_required_for_rest_of_parts;
5769 num_procs_assigned_to_each_part[i] = required_proc;
5774 double imbalance_wrt_ideal =
5775 (scalar_required_proc - required_proc) / required_proc;
5776 if(imbalance_wrt_ideal > max_imbalance_difference) {
5777 max_imbalance_difference = imbalance_wrt_ideal;
5778 max_differing_part = i;
5784 if(num_free_procs > 0) {
5785 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5792 mj_part_t *part_assignment_proc_begin_indices =
new mj_part_t[num_parts];
5796 mj_part_t *processor_chains_in_parts =
new mj_part_t [num_procs];
5797 mj_part_t *processor_part_assignments =
new mj_part_t[num_procs];
5806 for(
int i = 0; i < num_procs; ++i ) {
5807 processor_part_assignments[i] = -1;
5808 processor_chains_in_parts[i] = -1;
5810 for(
int i = 0; i < num_parts; ++i ) {
5811 part_assignment_proc_begin_indices[i] = -1;
5817 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5818 sort_item_num_part_points_in_procs =
5819 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5821 for(mj_part_t i = 0; i < num_parts; ++i) {
5826 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5827 sort_item_num_part_points_in_procs[ii].id = ii;
5830 if(processor_part_assignments[ii] == -1) {
5831 sort_item_num_part_points_in_procs[ii].val =
5832 num_points_in_all_processor_parts[ii * num_parts + i];
5834 sort_item_num_part_points_in_procs[ii].signbit = 1;
5847 sort_item_num_part_points_in_procs[ii].val =
5848 num_points_in_all_processor_parts[ii * num_parts + i];
5849 sort_item_num_part_points_in_procs[ii].signbit = 0;
5854 uqSignsort<mj_part_t, mj_gno_t,char>
5855 (num_procs, sort_item_num_part_points_in_procs);
5867 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5868 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5869 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5870 ceil(total_num_points_in_part /
double (required_proc_count)));
5873 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5874 mj_part_t next_proc_to_send_id =
5875 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5876 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5877 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5881 for(mj_part_t ii = num_procs - 1;
5882 ii >= num_procs - required_proc_count; --ii) {
5883 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5885 processor_part_assignments[proc_id] = i;
5888 bool did_change_sign =
false;
5890 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5893 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5894 did_change_sign =
true;
5895 sort_item_num_part_points_in_procs[ii].signbit = 1;
5902 if(did_change_sign) {
5905 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5906 sort_item_num_part_points_in_procs);
5921 if(!did_i_find_my_group) {
5922 for(mj_part_t ii = num_procs - 1; ii >=
5923 num_procs - required_proc_count; --ii) {
5925 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5928 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5930 if(proc_id_to_assign == this->myRank) {
5932 did_i_find_my_group =
true;
5935 part_assignment_proc_begin_indices[i] = this->myRank;
5936 processor_chains_in_parts[this->myRank] = -1;
5940 send_count_to_each_proc[this->myRank] =
5941 sort_item_num_part_points_in_procs[ii].val;
5945 for(mj_part_t in = 0; in < i; ++in) {
5946 output_part_numbering_begin_index +=
5947 (*next_future_num_parts_in_parts)[in];
5955 if(!did_i_find_my_group) {
5956 processor_ranks_for_subcomm.clear();
5964 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5965 mj_part_t nonassigned_proc_id =
5966 sort_item_num_part_points_in_procs[ii].id;
5967 mj_lno_t num_points_to_sent =
5968 sort_item_num_part_points_in_procs[ii].val;
5974 if(num_points_to_sent < 0) {
5975 cout <<
"Migration - processor assignments - for part:" << i
5976 <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:"
5977 << num_points_to_sent << std::endl;
5982 switch (migration_type) {
5986 while (num_points_to_sent > 0) {
5988 if(num_points_to_sent <= space_left_in_sent_proc) {
5990 space_left_in_sent_proc -= num_points_to_sent;
5992 if(this->myRank == nonassigned_proc_id) {
5994 send_count_to_each_proc[next_proc_to_send_id] =
5999 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6000 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6001 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6003 num_points_to_sent = 0;
6007 if(space_left_in_sent_proc > 0) {
6008 num_points_to_sent -= space_left_in_sent_proc;
6011 if(this->myRank == nonassigned_proc_id) {
6013 send_count_to_each_proc[next_proc_to_send_id] =
6014 space_left_in_sent_proc;
6015 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6016 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6017 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6021 ++next_proc_to_send_index;
6024 if(next_part_to_send_index < nprocs - required_proc_count ) {
6025 cout <<
"Migration - processor assignments - for part:"
6027 <<
" next_part_to_send :" << next_part_to_send_index
6028 <<
" nprocs:" << nprocs
6029 <<
" required_proc_count:" << required_proc_count
6030 <<
" Error: next_part_to_send_index <" <<
6031 <<
" nprocs - required_proc_count" << std::endl;
6036 next_proc_to_send_id =
6037 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6039 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6040 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6051 if(this->myRank == nonassigned_proc_id) {
6053 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6057 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6058 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6059 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6061 num_points_to_sent = 0;
6062 ++next_proc_to_send_index;
6066 if(next_proc_to_send_index == num_procs) {
6067 next_proc_to_send_index = num_procs - required_proc_count;
6070 next_proc_to_send_id =
6071 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6073 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6074 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6087 this->assign_send_destinations(
6089 part_assignment_proc_begin_indices,
6090 processor_chains_in_parts,
6091 send_count_to_each_proc,
6092 coordinate_destinations);
6093 delete [] part_assignment_proc_begin_indices;
6094 delete [] processor_chains_in_parts;
6095 delete [] processor_part_assignments;
6096 delete [] sort_item_num_part_points_in_procs;
6097 delete [] num_procs_assigned_to_each_part;
6115template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6116 typename mj_part_t,
typename mj_node_t>
6117void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6118 assign_send_destinations2(
6119 mj_part_t num_parts,
6120 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6121 int *coordinate_destinations,
6122 mj_part_t &output_part_numbering_begin_index,
6123 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6125 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6126 mj_part_t previous_processor = -1;
6128 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6129 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6131 auto local_new_coordinate_permutations =
6132 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6133 Kokkos::deep_copy(local_new_coordinate_permutations,
6134 this->new_coordinate_permutations);
6136 for(mj_part_t i = 0; i < num_parts; ++i) {
6137 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6140 mj_lno_t part_begin_index = 0;
6143 part_begin_index = local_new_part_xadj(p - 1);
6146 mj_lno_t part_end_index = local_new_part_xadj(p);
6148 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6149 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6150 output_part_numbering_begin_index = part_shift_amount;
6152 previous_processor = assigned_proc;
6153 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6155 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6156 mj_lno_t localInd = local_new_coordinate_permutations(j);
6157 coordinate_destinations[localInd] = assigned_proc;
6183template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6184 typename mj_part_t,
typename mj_node_t>
6185void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6186 mj_assign_parts_to_procs(
6187 mj_gno_t * num_points_in_all_processor_parts,
6188 mj_part_t num_parts,
6189 mj_part_t num_procs,
6190 mj_lno_t *send_count_to_each_proc,
6191 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6192 mj_part_t &out_num_part,
6193 std::vector<mj_part_t> &out_part_indices,
6194 mj_part_t &output_part_numbering_begin_index,
6195 int *coordinate_destinations) {
6198 mj_gno_t *global_num_points_in_parts =
6199 num_points_in_all_processor_parts + num_procs * num_parts;
6200 out_part_indices.clear();
6204 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6205 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6206 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6207 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6211 mj_lno_t work_each =
6212 mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
6216 mj_lno_t *space_in_each_processor =
new mj_lno_t[num_procs];
6219 for(mj_part_t i = 0; i < num_procs; ++i) {
6220 space_in_each_processor[i] = work_each;
6227 mj_part_t *num_parts_proc_assigned =
new mj_part_t[num_procs];
6228 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
6229 int empty_proc_count = num_procs;
6233 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6234 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6239 for(mj_part_t i = 0; i < num_parts; ++i) {
6240 sort_item_point_counts_in_parts[i].id = i;
6241 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6245 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6250 for(mj_part_t j = 0; j < num_parts; ++j) {
6252 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6255 mj_gno_t load = global_num_points_in_parts[i];
6258 mj_part_t assigned_proc = -1;
6261 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6262 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6267 if(empty_proc_count < num_parts - j ||
6268 num_parts_proc_assigned[ii] == 0) {
6270 sort_item_num_points_of_proc_in_part_i[ii].val =
6271 num_points_in_all_processor_parts[ii * num_parts + i];
6274 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6278 uqsort<mj_part_t, mj_gno_t>(num_procs,
6279 sort_item_num_points_of_proc_in_part_i);
6282 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6283 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6284 if(assigned_proc == -1 ||
6285 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6288 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6289 if(ii < assigned_proc) {
6305 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6309 space_in_each_processor[assigned_proc] -= load;
6311 sort_item_part_to_proc_assignment[j].id = i;
6314 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6317 if(assigned_proc == this->myRank) {
6319 out_part_indices.push_back(i);
6325 send_count_to_each_proc[assigned_proc] +=
6326 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6329 delete [] num_parts_proc_assigned;
6330 delete [] sort_item_num_points_of_proc_in_part_i;
6331 delete [] sort_item_point_counts_in_parts;
6332 delete [] space_in_each_processor;
6335 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6338 this->assign_send_destinations2(
6340 sort_item_part_to_proc_assignment,
6341 coordinate_destinations,
6342 output_part_numbering_begin_index,
6343 next_future_num_parts_in_parts);
6345 delete [] sort_item_part_to_proc_assignment;
6372template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6373 typename mj_part_t,
typename mj_node_t>
6374void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6375 mj_migration_part_proc_assignment(
6376 mj_gno_t * num_points_in_all_processor_parts,
6377 mj_part_t num_parts,
6378 mj_part_t num_procs,
6379 mj_lno_t *send_count_to_each_proc,
6380 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6381 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6382 mj_part_t &out_num_part,
6383 std::vector<mj_part_t> &out_part_indices,
6384 mj_part_t &output_part_numbering_begin_index,
6385 int *coordinate_destinations)
6387 processor_ranks_for_subcomm.clear();
6389 if(num_procs > num_parts) {
6394 mj_part_t out_part_index = 0;
6396 this->mj_assign_proc_to_parts(
6397 num_points_in_all_processor_parts,
6400 send_count_to_each_proc,
6401 processor_ranks_for_subcomm,
6402 next_future_num_parts_in_parts,
6404 output_part_numbering_begin_index,
6405 coordinate_destinations
6409 out_part_indices.clear();
6410 out_part_indices.push_back(out_part_index);
6417 processor_ranks_for_subcomm.push_back(this->myRank);
6422 this->mj_assign_parts_to_procs(
6423 num_points_in_all_processor_parts,
6426 send_count_to_each_proc,
6427 next_future_num_parts_in_parts,
6430 output_part_numbering_begin_index,
6431 coordinate_destinations);
6448template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6449 typename mj_part_t,
typename mj_node_t>
6450void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6452 mj_part_t num_procs,
6453 mj_lno_t &num_new_local_points,
6454 std::string iteration,
6455 int *coordinate_destinations,
6456 mj_part_t num_parts)
6459#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6460 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
6464 ZOLTAN_COMM_OBJ *plan = NULL;
6465 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6466 int num_incoming_gnos = 0;
6467 int message_tag = 7859;
6469 this->mj_env->timerStart(MACRO_TIMERS,
6470 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6471 int ierr = Zoltan_Comm_Create(
6473 int(this->num_local_coords),
6474 coordinate_destinations,
6477 &num_incoming_gnos);
6480 this->mj_env->timerStop(MACRO_TIMERS,
6481 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6483 this->mj_env->timerStart(MACRO_TIMERS,
6484 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6492 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6493 Kokkos::HostSpace(), this->current_mj_gnos);
6494 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6495 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6496 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), num_incoming_gnos);
6497 auto host_dst_gnos = Kokkos::create_mirror_view(
6498 Kokkos::HostSpace(), dst_gnos);
6500 ierr = Zoltan_Comm_Do(
6503 (
char *) host_current_mj_gnos.data(),
6505 (
char *) host_dst_gnos.data());
6507 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6508 this->current_mj_gnos = dst_gnos;
6514 auto host_src_coordinates = Kokkos::create_mirror_view(
6515 Kokkos::HostSpace(), this->mj_coordinates);
6516 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6517 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6518 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
6519 num_incoming_gnos, this->coord_dim);
6520 auto host_dst_coordinates = Kokkos::create_mirror_view(
6521 Kokkos::HostSpace(), dst_coordinates);
6522 for(
int i = 0; i < this->coord_dim; ++i) {
6523 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6524 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6525 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6526 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6529 ierr = Zoltan_Comm_Do(
6532 (
char *) sub_host_src_coordinates.data(),
6533 sizeof(mj_scalar_t),
6534 (
char *) sub_host_dst_coordinates.data());
6537 deep_copy(dst_coordinates, host_dst_coordinates);
6538 this->mj_coordinates = dst_coordinates;
6543 auto host_src_weights = Kokkos::create_mirror_view(
6544 Kokkos::HostSpace(), this->mj_weights);
6545 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6546 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6547 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
6548 num_incoming_gnos, this->num_weights_per_coord);
6549 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6550 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6551 auto sub_host_src_weights
6552 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6553 auto sub_host_dst_weights
6554 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6555 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6557 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6558 sent_weight[n] = sub_host_src_weights(n);
6560 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6562 ierr = Zoltan_Comm_Do(
6565 (
char *) sent_weight.getRawPtr(),
6566 sizeof(mj_scalar_t),
6567 (
char *) received_weight.getRawPtr());
6570 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6571 sub_host_dst_weights(n) = received_weight[n];
6574 deep_copy(dst_weights, host_dst_weights);
6575 this->mj_weights = dst_weights;
6581 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6582 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6585 ierr = Zoltan_Comm_Do(
6588 (
char *) owner_of_coordinate.data(),
6590 (
char *) dst_owners_of_coordinate.data());
6592 this->owner_of_coordinate = dst_owners_of_coordinate;
6599 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6600 Kokkos::HostSpace(), this->assigned_part_ids);
6601 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6602 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6603 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
6605 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6606 Kokkos::HostSpace(), dst_assigned_part_ids);
6607 mj_part_t *new_parts =
new mj_part_t[num_incoming_gnos];
6608 if(num_procs < num_parts) {
6610 ierr = Zoltan_Comm_Do(
6613 (
char *) host_src_assigned_part_ids.data(),
6615 (
char *) host_dst_assigned_part_ids.data());
6617 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6621 this->assigned_part_ids = dst_assigned_part_ids;
6624 ierr = Zoltan_Comm_Destroy(&plan);
6626 num_new_local_points = num_incoming_gnos;
6627 this->mj_env->timerStop(MACRO_TIMERS,
6628 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6633 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6634 "Migration DistributorPlanCreating-" + iteration);
6636 Tpetra::Distributor distributor(this->comm);
6637 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6638 this->num_local_coords);
6639 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6640 this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6641 "Migration DistributorPlanCreating-" + iteration);
6642 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6643 "Migration DistributorMigration-" + iteration);
6651 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6652 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
6655 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6656 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
6657 this->current_mj_gnos.extent(0));
6658 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6660 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6662 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6663 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_incoming_gnos);
6665 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6670 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6671 dst_coordinates(
"mj_coordinates", num_incoming_gnos, this->coord_dim);
6673 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6674 host_src_coordinates(
6675 Kokkos::ViewAllocateWithoutInitializing(
"host_coords"),
6676 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6677 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6679 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6680 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
6683 for(
int i = 0; i < this->coord_dim; ++i) {
6687 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6688 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6690 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6692 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6698 this->mj_coordinates = dst_coordinates;
6701 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6702 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6703 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6706 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6707 Kokkos::HostSpace(), this->mj_weights);
6710 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6711 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
6712 this->num_local_coords);
6714 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6715 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
6718 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6720 auto sub_host_src_weights
6721 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6723 auto sub_host_dst_weights
6724 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6731 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6732 sent_weight[n] = sub_host_src_weights(n);
6735 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6738 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6739 sub_host_dst_weights(n) = received_weight[n];
6742 Kokkos::deep_copy(dst_weights, host_dst_weights);
6743 this->mj_weights = dst_weights;
6748 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6749 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6752 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6754 this->owner_of_coordinate = received_owners;
6760 if(num_procs < num_parts) {
6761 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6762 Kokkos::ViewAllocateWithoutInitializing(
"host_parts"),
6763 this->assigned_part_ids.extent(0));
6764 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6766 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6767 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
6770 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6772 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6773 (
"assigned_part_ids", num_incoming_gnos);
6774 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6777 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6778 (
"assigned_part_ids", num_incoming_gnos);
6780 this->mj_env->timerStop(MACRO_TIMERS,
"" + mj_timer_base_string +
6781 "Migration DistributorMigration-" + iteration);
6783 num_new_local_points = num_incoming_gnos;
6792template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6793 typename mj_part_t,
typename mj_node_t>
6794void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6795 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6797 mj_part_t group_size = processor_ranks_for_subcomm.size();
6798 mj_part_t *ids =
new mj_part_t[group_size];
6799 for(mj_part_t i = 0; i < group_size; ++i) {
6800 ids[i] = processor_ranks_for_subcomm[i];
6802 ArrayView<const mj_part_t> idView(ids, group_size);
6803 this->comm = this->comm->createSubcommunicator(idView);
6812template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6813 typename mj_part_t,
typename mj_node_t>
6814void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6815 fill_permutation_array(
6816 mj_part_t output_num_parts,
6817 mj_part_t num_parts)
6820 if(output_num_parts == 1) {
6821 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6822 Kokkos::parallel_for(
6823 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6824 (0, this->num_local_coords),
6825 KOKKOS_LAMBDA(mj_lno_t i) {
6826 local_new_coordinate_permutations(i) = i;
6828 auto local_new_part_xadj = this->new_part_xadj;
6829 auto local_num_local_coords = this->num_local_coords;
6830 Kokkos::parallel_for(
6831 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6832 KOKKOS_LAMBDA(
int dummy) {
6833 local_new_part_xadj(0) = local_num_local_coords;
6837 auto local_num_local_coords = this->num_local_coords;
6838 auto local_assigned_part_ids = this->assigned_part_ids;
6839 auto local_new_part_xadj = this->new_part_xadj;
6840 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6843 Kokkos::View<mj_part_t*, device_t> part_shifts(
"part_shifts", num_parts);
6848 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6849 "num_points_in_parts", num_parts);
6851 Kokkos::parallel_for(
6852 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6853 KOKKOS_LAMBDA(
int dummy) {
6855 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6856 mj_part_t ii = local_assigned_part_ids(i);
6857 ++num_points_in_parts(ii);
6862 mj_lno_t prev_index = 0;
6863 for(mj_part_t i = 0; i < num_parts; ++i) {
6864 if(num_points_in_parts(i) > 0) {
6865 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6866 prev_index += num_points_in_parts(i);
6867 part_shifts(i) = p++;
6872 mj_part_t assigned_num_parts = p - 1;
6873 for(;p < num_parts; ++p) {
6874 local_new_part_xadj(p) =
6875 local_new_part_xadj(assigned_num_parts);
6877 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6878 num_points_in_parts(i) = local_new_part_xadj(i);
6884 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6886 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6887 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6917template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6918 typename mj_part_t,
typename mj_node_t>
6919bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6920 mj_perform_migration(
6921 mj_part_t input_num_parts,
6922 mj_part_t &output_num_parts,
6923 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6924 mj_part_t &output_part_begin_index,
6925 size_t migration_reduce_all_population,
6926 mj_lno_t num_coords_for_last_dim_part,
6927 std::string iteration,
6928 RCP<mj_partBoxVector_t> &input_part_boxes,
6929 RCP<mj_partBoxVector_t> &output_part_boxes)
6931 mj_part_t num_procs = this->comm->getSize();
6932 this->myRank = this->comm->getRank();
6937 mj_gno_t *num_points_in_all_processor_parts =
6938 new mj_gno_t[input_num_parts * (num_procs + 1)];
6941 this->get_processor_num_points_in_parts(
6944 num_points_in_all_processor_parts);
6947 if(!this->mj_check_to_migrate(
6948 migration_reduce_all_population,
6949 num_coords_for_last_dim_part,
6952 num_points_in_all_processor_parts)) {
6953 delete [] num_points_in_all_processor_parts;
6957 mj_lno_t *send_count_to_each_proc = NULL;
6958 int *coordinate_destinations =
new int[this->num_local_coords];
6959 send_count_to_each_proc =
new mj_lno_t[num_procs];
6961 for(
int i = 0; i < num_procs; ++i) {
6962 send_count_to_each_proc[i] = 0;
6965 std::vector<mj_part_t> processor_ranks_for_subcomm;
6966 std::vector<mj_part_t> out_part_indices;
6969 this->mj_migration_part_proc_assignment(
6970 num_points_in_all_processor_parts,
6973 send_count_to_each_proc,
6974 processor_ranks_for_subcomm,
6975 next_future_num_parts_in_parts,
6978 output_part_begin_index,
6979 coordinate_destinations);
6981 delete [] send_count_to_each_proc;
6982 std::vector <mj_part_t> tmpv;
6984 std::sort (out_part_indices.begin(), out_part_indices.end());
6985 mj_part_t outP = out_part_indices.size();
6986 mj_gno_t new_global_num_points = 0;
6987 mj_gno_t *global_num_points_in_parts =
6988 num_points_in_all_processor_parts + num_procs * input_num_parts;
6990 if(this->mj_keep_part_boxes) {
6991 input_part_boxes->clear();
6996 for(mj_part_t i = 0; i < outP; ++i) {
6997 mj_part_t ind = out_part_indices[i];
6998 new_global_num_points += global_num_points_in_parts[ind];
6999 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7000 if(this->mj_keep_part_boxes) {
7001 input_part_boxes->push_back((*output_part_boxes)[ind]);
7006 if(this->mj_keep_part_boxes) {
7007 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7008 input_part_boxes = output_part_boxes;
7009 output_part_boxes = tmpPartBoxes;
7011 next_future_num_parts_in_parts->clear();
7012 for(mj_part_t i = 0; i < outP; ++i) {
7013 mj_part_t p = tmpv[i];
7014 next_future_num_parts_in_parts->push_back(p);
7017 delete [] num_points_in_all_processor_parts;
7019 mj_lno_t num_new_local_points = 0;
7021 this->mj_migrate_coords(
7023 num_new_local_points,
7025 coordinate_destinations,
7028 delete [] coordinate_destinations;
7029 if(this->num_local_coords != num_new_local_points) {
7030 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7031 (Kokkos::ViewAllocateWithoutInitializing(
"new_coordinate_permutations"),
7032 num_new_local_points);
7033 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7034 (Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
7035 num_new_local_points);
7037 this->num_local_coords = num_new_local_points;
7038 this->num_global_coords = new_global_num_points;
7041 this->create_sub_communicator(processor_ranks_for_subcomm);
7043 processor_ranks_for_subcomm.clear();
7046 this->fill_permutation_array(output_num_parts, input_num_parts);
7069template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7070 typename mj_part_t,
typename mj_node_t>
7071void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7072 create_consistent_chunks(
7073 mj_part_t num_parts,
7074 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7075 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7076 mj_lno_t coordinate_begin,
7077 mj_lno_t coordinate_end,
7078 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7079 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7081 bool longest_dim_part,
7082 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7092 mj_part_t no_cuts = num_parts - 1;
7096 if(this->distribute_points_on_cut_lines) {
7097 auto local_thread_cut_line_weight_to_put_left =
7098 this->thread_cut_line_weight_to_put_left;
7099 auto local_thread_part_weight_work =
7100 this->thread_part_weight_work;
7101 auto local_sEpsilon = this->sEpsilon;
7103 Kokkos::parallel_for(
7104 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
7105 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7107 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7108 if(left_weight > local_sEpsilon) {
7110 mj_scalar_t thread_ii_weight_on_cut =
7111 local_thread_part_weight_work(i * 2 + 1) -
7112 local_thread_part_weight_work(i * 2);
7113 if(thread_ii_weight_on_cut < left_weight) {
7114 local_thread_cut_line_weight_to_put_left(i) =
7115 thread_ii_weight_on_cut;
7118 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7122 local_thread_cut_line_weight_to_put_left(i) = 0;
7127 auto local_least_signifiance = least_signifiance;
7128 auto local_significance_mul = significance_mul;
7129 Kokkos::parallel_for(
7130 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7131 (0, 1), KOKKOS_LAMBDA (
int dummy) {
7135 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7136 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7137 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7138 mj_scalar_t delta = cut2 - cut1;
7139 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7140 if(abs_delta < local_sEpsilon) {
7141 local_thread_cut_line_weight_to_put_left(i) -=
7142 local_thread_cut_line_weight_to_put_left(i - 1);
7144 local_thread_cut_line_weight_to_put_left(i) =
7145 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7146 local_least_signifiance) * local_significance_mul) /
7147 static_cast<mj_scalar_t
>(local_significance_mul);
7153 auto local_thread_point_counts = this->thread_point_counts;
7154 Kokkos::parallel_for(
7155 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7156 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7157 local_thread_point_counts(i) = 0;
7168 mj_part_t *cut_map =
new mj_part_t[no_cuts];
7170 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7171 typedef std::vector< multiSItem > multiSVector;
7172 typedef std::vector<multiSVector> multiS2Vector;
7175 std::vector<mj_scalar_t *>allocated_memory;
7178 multiS2Vector sort_vector_points_on_cut;
7181 mj_part_t different_cut_count = 1;
7187 multiSVector tmpMultiSVector;
7188 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7190 auto local_current_concurrent_cut_coordinate =
7191 current_concurrent_cut_coordinate;
7192 auto host_current_concurrent_cut_coordinate =
7193 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7194 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7195 local_current_concurrent_cut_coordinate);
7197 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7200 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7201 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7202 cut_map[i] = cut_map[i-1];
7205 cut_map[i] = different_cut_count++;
7206 multiSVector tmp2MultiSVector;
7207 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7210 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7211 host_current_concurrent_cut_coordinate);
7214 auto host_coordinate_permutations =
7215 Kokkos::create_mirror_view(coordinate_permutations);
7216 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7218 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7219 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7221 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7222 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7224 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7225 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7227 auto local_coord_dim = this->coord_dim;
7229 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7230 mj_lno_t i = host_coordinate_permutations(ii);
7231 mj_part_t pp = host_assigned_part_ids(i);
7232 mj_part_t p = pp / 2;
7235 mj_scalar_t *vals =
new mj_scalar_t[local_coord_dim -1];
7236 allocated_memory.push_back(vals);
7241 if(longest_dim_part) {
7243 for(
int dim = local_coord_dim - 2; dim >= 0; --dim) {
7246 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7251 host_mj_coordinates(i,next_largest_coord_dim);
7255 for(
int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7256 vals[val_ind++] = host_mj_coordinates(i,dim);
7258 for(
int dim = 0; dim < coordInd; ++dim) {
7259 vals[val_ind++] = host_mj_coordinates(i,dim);
7263 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7265 mj_part_t cmap = cut_map[p];
7266 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7270 ++host_thread_point_counts(p);
7271 host_assigned_part_ids(i) = p;
7276 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7277 std::sort (sort_vector_points_on_cut[i].begin(),
7278 sort_vector_points_on_cut[i].end());
7281 mj_part_t previous_cut_map = cut_map[0];
7283 auto host_thread_cut_line_weight_to_put_left =
7284 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7285 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7286 thread_cut_line_weight_to_put_left);
7288 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7289 Kokkos::deep_copy(host_mj_weights, mj_weights);
7299 mj_scalar_t weight_stolen_from_previous_part = 0;
7300 for(mj_part_t p = 0; p < no_cuts; ++p) {
7301 mj_part_t mapped_cut = cut_map[p];
7305 if(previous_cut_map != mapped_cut) {
7306 mj_lno_t sort_vector_end = (mj_lno_t)
7307 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7308 for(; sort_vector_end >= 0; --sort_vector_end) {
7310 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7311 mj_lno_t i = t.index;
7312 ++host_thread_point_counts(p);
7313 host_assigned_part_ids(i) = p;
7315 sort_vector_points_on_cut[previous_cut_map].clear();
7319 mj_lno_t sort_vector_end = (mj_lno_t)
7320 sort_vector_points_on_cut[mapped_cut].size() - 1;
7326 for(; sort_vector_end >= 0; --sort_vector_end) {
7329 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7331 mj_lno_t i = t.index;
7332 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7333 this->mj_weights(i,0);
7335 if(host_thread_cut_line_weight_to_put_left(p) +
7336 weight_stolen_from_previous_part> this->sEpsilon &&
7337 host_thread_cut_line_weight_to_put_left(p) +
7338 weight_stolen_from_previous_part -
7339 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7340 weight_stolen_from_previous_part - w)> this->sEpsilon)
7342 host_thread_cut_line_weight_to_put_left(p) -= w;
7344 sort_vector_points_on_cut[mapped_cut].pop_back();
7346 ++host_thread_point_counts(p);
7347 host_assigned_part_ids(i) = p;
7351 if(p < no_cuts - 1 &&
7352 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7353 if(mapped_cut == cut_map[p + 1] ) {
7356 if(previous_cut_map != mapped_cut) {
7357 weight_stolen_from_previous_part =
7358 host_thread_cut_line_weight_to_put_left(p);
7363 weight_stolen_from_previous_part +=
7364 host_thread_cut_line_weight_to_put_left(p);
7368 weight_stolen_from_previous_part =
7369 -host_thread_cut_line_weight_to_put_left(p);
7378 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7379 if(previous_cut_map != mapped_cut) {
7380 weight_stolen_from_previous_part =
7381 host_thread_cut_line_weight_to_put_left(p);
7384 weight_stolen_from_previous_part +=
7385 host_thread_cut_line_weight_to_put_left(p);
7389 weight_stolen_from_previous_part =
7390 -host_thread_cut_line_weight_to_put_left(p);
7396 previous_cut_map = mapped_cut;
7401 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7402 previous_cut_map].size() - 1;
7408 for(; sort_vector_end >= 0; --sort_vector_end) {
7410 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7413 mj_lno_t i = t.index;
7414 ++host_thread_point_counts(no_cuts);
7415 host_assigned_part_ids(i) = no_cuts;
7418 sort_vector_points_on_cut[previous_cut_map].clear();
7422 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7423 for(mj_lno_t i = 0; i < vSize; ++i) {
7424 delete [] allocated_memory[i];
7427 auto local_out_part_xadj = out_part_xadj;
7428 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7429 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7432 for(mj_part_t j = 0; j < num_parts; ++j) {
7433 host_out_part_xadj(j) = host_thread_point_counts(j);
7434 host_thread_point_counts(j) = 0;
7438 for(mj_part_t j = 1; j < num_parts; ++j) {
7439 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7444 for(mj_part_t j = 1; j < num_parts; ++j) {
7445 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7448 auto host_new_coordinate_permutations =
7449 Kokkos::create_mirror_view(new_coordinate_permutations);
7450 Kokkos::deep_copy(host_new_coordinate_permutations,
7451 new_coordinate_permutations);
7455 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7456 mj_lno_t i = host_coordinate_permutations(ii);
7457 mj_part_t p = host_assigned_part_ids(i);
7458 host_new_coordinate_permutations(coordinate_begin +
7459 host_thread_point_counts(p)++) = i;
7462 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7463 Kokkos::deep_copy(new_coordinate_permutations,
7464 host_new_coordinate_permutations);
7465 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7477template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7478 typename mj_part_t,
typename mj_node_t>
7479void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7481 mj_part_t current_num_parts,
7482 mj_part_t output_part_begin_index,
7483 RCP<mj_partBoxVector_t> &output_part_boxes,
7484 bool is_data_ever_migrated)
7486 this->mj_env->timerStart(MACRO_TIMERS,
7487 mj_timer_base_string +
"Part_Assignment");
7489 auto local_part_xadj = part_xadj;
7490 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7491 auto local_coordinate_permutations = coordinate_permutations;
7492 auto local_assigned_part_ids = assigned_part_ids;
7494 if(local_mj_keep_part_boxes) {
7495 for(
int i = 0; i < current_num_parts; ++i) {
7496 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7500 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7501 current_num_parts, Kokkos::AUTO());
7502 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7503 member_type member_type;
7504 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7505 int i = team_member.league_rank();
7506 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7507 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7509 mj_lno_t k = local_coordinate_permutations(ii);
7510 local_assigned_part_ids(k) = i + output_part_begin_index;
7514 if(is_data_ever_migrated) {
7515#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7516 if(
sizeof(mj_lno_t) <=
sizeof(int)) {
7523 ZOLTAN_COMM_OBJ *plan = NULL;
7524 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7527 int message_tag = 7856;
7529 this->mj_env->timerStart(MACRO_TIMERS,
7530 mj_timer_base_string +
"Final Z1PlanCreating");
7533 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
7534 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7537 this->mj_env->timerStop(MACRO_TIMERS,
7538 mj_timer_base_string +
"Final Z1PlanCreating" );
7540 this->mj_env->timerStart(MACRO_TIMERS,
7541 mj_timer_base_string +
"Final Z1PlanComm");
7548 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7549 Kokkos::HostSpace(), this->current_mj_gnos);
7550 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7551 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7552 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), incoming);
7553 auto host_dst_gnos = Kokkos::create_mirror_view(
7554 Kokkos::HostSpace(), dst_gnos);
7556 ierr = Zoltan_Comm_Do( plan, message_tag,
7557 (
char *) host_current_mj_gnos.data(),
7558 sizeof(mj_gno_t), (
char *) host_dst_gnos.data());
7560 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7561 this->current_mj_gnos = dst_gnos;
7564 auto host_src_part_ids = Kokkos::create_mirror_view(
7565 Kokkos::HostSpace(), this->assigned_part_ids);
7566 deep_copy(host_src_part_ids, this->assigned_part_ids);
7567 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7568 Kokkos::ViewAllocateWithoutInitializing(
"dst_part_ids"), incoming);
7569 auto host_dst_part_ids = Kokkos::create_mirror_view(
7570 Kokkos::HostSpace(), dst_part_ids);
7572 ierr = Zoltan_Comm_Do( plan, message_tag,
7573 (
char *) host_src_part_ids.data(),
7574 sizeof(mj_part_t), (
char *) host_dst_part_ids.data());
7576 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7577 this->assigned_part_ids = dst_part_ids;
7579 ierr = Zoltan_Comm_Destroy(&plan);
7582 this->num_local_coords = incoming;
7584 this->mj_env->timerStop(MACRO_TIMERS,
7585 mj_timer_base_string +
"Final Z1PlanComm");
7591 this->mj_env->timerStart(MACRO_TIMERS,
7592 mj_timer_base_string +
"Final DistributorPlanCreating");
7593 Tpetra::Distributor distributor(this->mj_problemComm);
7594 ArrayView<const mj_part_t> owners_of_coords(
7595 this->owner_of_coordinate.data(), this->num_local_coords);
7596 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7597 this->mj_env->timerStop(MACRO_TIMERS,
7598 mj_timer_base_string +
"Final DistributorPlanCreating" );
7600 this->mj_env->timerStart(MACRO_TIMERS,
7601 mj_timer_base_string +
"Final DistributorPlanComm");
7607 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7608 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
7609 this->current_mj_gnos.extent(0));
7610 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7612 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7613 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
7616 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7618 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7619 Kokkos::ViewAllocateWithoutInitializing(
"current_mj_gnos"), incoming);
7621 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7624 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7625 Kokkos::ViewAllocateWithoutInitializing(
"sent_partids"),
7626 this->assigned_part_ids.extent(0));
7627 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7629 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7630 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
7633 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7635 this->assigned_part_ids =
7636 Kokkos::View<mj_part_t*, device_t>(
7637 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
7640 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7641 this->num_local_coords = incoming;
7643 this->mj_env->timerStop(MACRO_TIMERS,
7644 mj_timer_base_string +
"Final DistributorPlanComm");
7648 this->mj_env->timerStop(MACRO_TIMERS,
7649 mj_timer_base_string +
"Part_Assignment");
7651 this->mj_env->timerStart(MACRO_TIMERS,
7652 mj_timer_base_string +
"Solution_Part_Assignment");
7657 if(this->mj_keep_part_boxes) {
7658 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7661 this->mj_env->timerStop(MACRO_TIMERS,
7662 mj_timer_base_string +
"Solution_Part_Assignment");
7677template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7678 typename mj_part_t,
typename mj_node_t>
7681 bool distribute_points_on_cut_lines_,
7682 int max_concurrent_part_calculation_,
7683 int check_migrate_avoid_migration_option_,
7684 double minimum_migration_imbalance_,
7685 int migration_type_)
7687 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7688 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7689 this->check_migrate_avoid_migration_option =
7690 check_migrate_avoid_migration_option_;
7691 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7692 this->migration_type = migration_type_;
7722template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7723 typename mj_part_t,
typename mj_node_t>
7726 const RCP<const Environment> &env,
7727 RCP<
const Comm<int> > &problemComm,
7728 double imbalance_tolerance_,
7730 size_t num_global_parts_,
7731 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7732 int recursion_depth_,
7734 mj_lno_t num_local_coords_,
7735 mj_gno_t num_global_coords_,
7736 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7738 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7739 int num_weights_per_coord_,
7740 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7741 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7742 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7743 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7744 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7748 int execute_counter = Zoltan2_AlgMJ_TrackCallsCounter::get_counter_AlgMJ();
7749 this->mj_timer_base_string =
"MJ(" + std::to_string(execute_counter) +
") - ";
7752 this->mj_problemComm = problemComm;
7753 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7755 mj_timer_base_string +
"Total");
7756 this->mj_env->debug(3,
"In MultiJagged Jagged");
7757 this->imbalance_tolerance = imbalance_tolerance_;
7758 this->mj_num_teams = num_teams_;
7759 this->num_global_parts = num_global_parts_;
7760 this->part_no_array = part_no_array_;
7761 this->recursion_depth = recursion_depth_;
7762 this->coord_dim = coord_dim_;
7763 this->num_local_coords = num_local_coords_;
7764 this->num_global_coords = num_global_coords_;
7765 this->mj_coordinates = mj_coordinates_;
7766 this->initial_mj_gnos = initial_mj_gnos_;
7767 this->num_weights_per_coord = num_weights_per_coord_;
7768 this->mj_uniform_weights = mj_uniform_weights_;
7769 this->mj_weights = mj_weights_;
7770 this->mj_uniform_parts = mj_uniform_parts_;
7774 this->set_part_specifications();
7777 mj_timer_base_string +
"Allocate Views");
7778 this->allocate_set_work_memory();
7780 mj_timer_base_string +
"Allocate Views");
7784 this->comm = this->mj_problemComm->duplicate();
7787 if(comm->getRank() == 0) {
7788 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
7789 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
7790 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
7795 mj_part_t current_num_parts = 1;
7796 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7797 this->all_cut_coordinates;
7799 mj_timer_base_string +
"Problem_Partitioning");
7800 mj_part_t output_part_begin_index = 0;
7801 mj_part_t future_num_parts = this->total_num_part;
7802 bool is_data_ever_migrated =
false;
7804 std::vector<mj_part_t> *future_num_part_in_parts =
7805 new std::vector<mj_part_t> ();
7806 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7807 new std::vector<mj_part_t> ();
7809 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7811 RCP<mj_partBoxVector_t> input_part_boxes;
7812 RCP<mj_partBoxVector_t> output_part_boxes;
7814 if(this->mj_keep_part_boxes) {
7815 input_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7816 output_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7817 compute_global_box();
7818 this->init_part_boxes(output_part_boxes);
7821 auto local_part_xadj = this->part_xadj;
7825 Kokkos::View<mj_part_t*, device_t>
7826 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
7827 Kokkos::View<size_t*, device_t>
7828 view_total_reduction_size(
"view_total_reduction_size", 1);
7830 for(
int i = 0; i < this->recursion_depth; ++i) {
7833 std::string istring = std::to_string(i);
7839 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7840 future_num_part_in_parts = next_future_num_parts_in_parts;
7841 next_future_num_parts_in_parts = tmpPartVect;
7845 next_future_num_parts_in_parts->clear();
7846 if(this->mj_keep_part_boxes) {
7847 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7848 input_part_boxes = output_part_boxes;
7849 output_part_boxes = tmpPartBoxes;
7850 output_part_boxes->clear();
7854 mj_part_t output_part_count_in_dimension =
7855 this->update_part_num_arrays(
7856 future_num_part_in_parts,
7857 next_future_num_parts_in_parts,
7862 output_part_boxes, 1);
7867 if(output_part_count_in_dimension == current_num_parts) {
7869 tmpPartVect= future_num_part_in_parts;
7870 future_num_part_in_parts = next_future_num_parts_in_parts;
7871 next_future_num_parts_in_parts = tmpPartVect;
7873 if(this->mj_keep_part_boxes) {
7874 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7875 input_part_boxes = output_part_boxes;
7876 output_part_boxes = tmpPartBoxes;
7882 int coordInd = i % this->coord_dim;
7884 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7885 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7888 mj_timer_base_string +
"Problem_Partitioning_" + istring);
7892 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7893 "new part xadj", output_part_count_in_dimension);
7896 mj_part_t output_part_index = 0;
7900 mj_part_t output_coordinate_end_index = 0;
7902 mj_part_t current_work_part = 0;
7903 mj_part_t current_concurrent_num_parts =
7904 std::min(current_num_parts - current_work_part,
7905 this->max_concurrent_part_calculation);
7907 mj_part_t obtained_part_index = 0;
7909 auto host_process_local_min_max_coord_total_weight =
7910 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7911 auto host_global_min_max_coord_total_weight =
7912 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7915 for(; current_work_part < current_num_parts;
7916 current_work_part += current_concurrent_num_parts) {
7918 current_concurrent_num_parts =
7919 std::min(current_num_parts - current_work_part,
7920 this->max_concurrent_part_calculation);
7923 auto local_device_num_partitioning_in_current_dim =
7924 device_num_partitioning_in_current_dim;
7925 Kokkos::parallel_reduce(
"Read bDoingWork",
7926 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7927 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
7929 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7930 if(local_device_num_partitioning_in_current_dim(
7931 current_work_part + kk) != 1) {
7937 bool bDoingWork = (bDoingWork_int != 0) ?
true :
false;
7939 this->mj_get_local_min_max_coord_totW(
7941 current_concurrent_num_parts,
7942 mj_current_dim_coords);
7947 this->mj_get_global_min_max_coord_totW(
7948 current_concurrent_num_parts,
7949 this->process_local_min_max_coord_total_weight,
7950 this->global_min_max_coord_total_weight);
7954 mj_part_t total_incomplete_cut_count = 0;
7959 mj_part_t concurrent_part_cut_shift = 0;
7960 mj_part_t concurrent_part_part_shift = 0;
7962 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7964 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7965 global_min_max_coord_total_weight);
7967 mj_scalar_t min_coordinate =
7968 host_global_min_max_coord_total_weight(kk);
7969 mj_scalar_t max_coordinate =
7970 host_global_min_max_coord_total_weight(
7971 kk + current_concurrent_num_parts);
7973 mj_scalar_t global_total_weight =
7974 host_global_min_max_coord_total_weight(
7975 kk + 2 * current_concurrent_num_parts);
7977 mj_part_t concurrent_current_part_index = current_work_part + kk;
7979 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7980 concurrent_current_part_index);
7982 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7983 Kokkos::subview(current_cut_coordinates,
7984 std::pair<mj_lno_t, mj_lno_t>(
7985 concurrent_part_cut_shift, current_cut_coordinates.size()));
7986 Kokkos::View<mj_scalar_t *, device_t>
7987 current_target_part_weights =
7988 Kokkos::subview(target_part_weights,
7989 std::pair<mj_lno_t, mj_lno_t>(
7990 concurrent_part_part_shift, target_part_weights.size()));
7993 concurrent_part_cut_shift += partition_count - 1;
7995 concurrent_part_part_shift += partition_count;
7999 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8003 total_incomplete_cut_count += partition_count - 1;
8005 this->incomplete_cut_count(kk) = partition_count - 1;
8008 this->mj_get_initial_cut_coords_target_weights(
8011 partition_count - 1,
8012 global_total_weight,
8014 current_target_part_weights,
8015 future_num_part_in_parts,
8016 next_future_num_parts_in_parts,
8017 concurrent_current_part_index,
8018 obtained_part_index);
8020 mj_lno_t coordinate_end_index =
8021 host_part_xadj(concurrent_current_part_index);
8022 mj_lno_t coordinate_begin_index =
8023 concurrent_current_part_index==0 ? 0 :
8024 host_part_xadj(concurrent_current_part_index - 1);
8026 this->set_initial_coordinate_parts(
8029 coordinate_begin_index, coordinate_end_index,
8030 this->coordinate_permutations,
8031 mj_current_dim_coords,
8032 this->assigned_part_ids,
8038 this->incomplete_cut_count(kk) = 0;
8041 obtained_part_index += partition_count;
8046 double used_imbalance = 0;
8049 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8052 mj_current_dim_coords,
8055 current_concurrent_num_parts,
8056 current_cut_coordinates,
8057 total_incomplete_cut_count,
8058 view_rectilinear_cut_count,
8059 view_total_reduction_size);
8062 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8067 mj_part_t output_array_shift = 0;
8068 mj_part_t cut_shift = 0;
8069 size_t tlr_shift = 0;
8070 size_t partweight_array_shift = 0;
8071 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8073 mj_part_t current_concurrent_work_part = current_work_part + kk;
8075 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8076 current_concurrent_work_part);
8079 int coordinateA_bigger_than_coordinateB =
8080 host_global_min_max_coord_total_weight(kk) >
8081 host_global_min_max_coord_total_weight(
8082 kk + current_concurrent_num_parts);
8084 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8087 auto local_new_part_xadj = this->new_part_xadj;
8088 Kokkos::parallel_for(
8089 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8090 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8091 local_new_part_xadj(
8092 output_part_index + output_array_shift + jj) = 0;
8095 cut_shift += num_parts - 1;
8096 tlr_shift += (4 *(num_parts - 1) + 1);
8097 output_array_shift += num_parts;
8098 partweight_array_shift += (2 * (num_parts - 1) + 1);
8102 Kokkos::View<mj_scalar_t *, device_t>
8103 current_concurrent_cut_coordinate =
8104 Kokkos::subview(current_cut_coordinates,
8105 std::pair<mj_lno_t, mj_lno_t>(
8107 current_cut_coordinates.size()));
8108 Kokkos::View<mj_scalar_t *, device_t>
8109 used_local_cut_line_weight_to_left =
8110 Kokkos::subview(process_cut_line_weight_to_put_left,
8111 std::pair<mj_lno_t, mj_lno_t>(
8113 process_cut_line_weight_to_put_left.size()));
8115 this->thread_part_weight_work =
8117 this->thread_part_weights,
8118 std::pair<mj_lno_t, mj_lno_t>(
8119 partweight_array_shift,
8120 this->thread_part_weights.extent(0)));
8123 if(this->mj_keep_part_boxes) {
8125 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8126 mj_scalar_t temp_get_val;
8127 Kokkos::parallel_reduce(
"Read single",
8128 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8129 KOKKOS_LAMBDA(
int dummy, mj_scalar_t & set_single) {
8130 set_single = current_concurrent_cut_coordinate(j);
8132 (*output_part_boxes)
8133 [output_array_shift + output_part_index + j].
8134 updateMinMax(temp_get_val, 1 , coordInd);
8135 (*output_part_boxes)
8136 [output_array_shift + output_part_index + j + 1].
8137 updateMinMax(temp_get_val, 0 , coordInd);
8142 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8143 Kokkos::subview(this->new_part_xadj,
8144 std::pair<mj_lno_t, mj_lno_t>(
8145 output_part_index + output_array_shift,
8146 this->new_part_xadj.size()));
8148 this->mj_create_new_partitions(
8150 current_concurrent_work_part,
8151 mj_current_dim_coords,
8152 current_concurrent_cut_coordinate,
8153 used_local_cut_line_weight_to_left,
8158 mj_lno_t coordinate_end = host_part_xadj(
8159 current_concurrent_work_part);
8160 mj_lno_t coordinate_begin =
8161 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8162 current_concurrent_work_part - 1);
8166 mj_lno_t part_size = coordinate_end - coordinate_begin;
8170 auto local_new_part_xadj = this->new_part_xadj;
8171 Kokkos::parallel_for(
8172 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8173 (0, 1), KOKKOS_LAMBDA (
int dummy) {
8174 local_new_part_xadj(
8175 output_part_index + output_array_shift) = part_size;
8178 auto subview_new_coordinate_permutations =
8179 Kokkos::subview(this->new_coordinate_permutations,
8180 std::pair<mj_lno_t, mj_lno_t>(
8182 coordinate_begin + part_size));
8183 auto subview_coordinate_permutations =
8184 Kokkos::subview(this->coordinate_permutations,
8185 std::pair<mj_lno_t, mj_lno_t>(
8187 coordinate_begin + part_size));
8188 Kokkos::deep_copy(subview_new_coordinate_permutations,
8189 subview_coordinate_permutations);
8191 cut_shift += num_parts - 1;
8192 output_array_shift += num_parts;
8193 partweight_array_shift += (2 * (num_parts - 1) + 1);
8202 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8203 mj_part_t num_parts =
8204 host_num_partitioning_in_current_dim(current_work_part + kk);
8208 auto local_new_part_xadj = this->new_part_xadj;
8209 Kokkos::parallel_for(
8210 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8211 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8212 local_new_part_xadj(output_part_index+ii) +=
8213 output_coordinate_end_index;
8218 Kokkos::parallel_reduce(
"Read single",
8219 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8220 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
8222 local_new_part_xadj(output_part_index + num_parts - 1);
8224 output_coordinate_end_index = temp_get;
8226 output_part_index += num_parts;
8232 int current_world_size = this->comm->getSize();
8233 long migration_reduce_all_population =
8234 this->total_dim_num_reduce_all * current_world_size;
8235 bool is_migrated_in_current_dimension =
false;
8240 if(future_num_parts > 1 &&
8241 this->check_migrate_avoid_migration_option >= 0 &&
8242 current_world_size > 1) {
8244 mj_timer_base_string +
"Problem_Migration-" + istring);
8245 mj_part_t num_parts = output_part_count_in_dimension;
8247 if(this->mj_perform_migration(
8250 next_future_num_parts_in_parts,
8251 output_part_begin_index,
8252 migration_reduce_all_population,
8253 this->num_global_coords / (future_num_parts * current_num_parts),
8255 input_part_boxes, output_part_boxes) )
8257 is_migrated_in_current_dimension =
true;
8258 is_data_ever_migrated =
true;
8260 mj_timer_base_string +
"Problem_Migration-" + istring);
8263 this->total_dim_num_reduce_all /= num_parts;
8266 is_migrated_in_current_dimension =
false;
8268 mj_timer_base_string +
"Problem_Migration-" + istring);
8273 Kokkos::View<mj_lno_t*, device_t> tmp =
8274 this->coordinate_permutations;
8275 this->coordinate_permutations =
8276 this->new_coordinate_permutations;
8278 this->new_coordinate_permutations = tmp;
8279 if(!is_migrated_in_current_dimension) {
8280 this->total_dim_num_reduce_all -= current_num_parts;
8281 current_num_parts = output_part_count_in_dimension;
8285 this->part_xadj = this->new_part_xadj;
8286 local_part_xadj = this->new_part_xadj;
8287 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8288 Kokkos::deep_copy(host_part_xadj, part_xadj);
8290 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
8292 mj_timer_base_string +
"Problem_Partitioning_" + istring);
8297 delete future_num_part_in_parts;
8298 delete next_future_num_parts_in_parts;
8300 mj_timer_base_string +
"Problem_Partitioning");
8306 this->set_final_parts(
8308 output_part_begin_index,
8310 is_data_ever_migrated);
8312 result_assigned_part_ids_ = this->assigned_part_ids;
8313 result_mj_gnos_ = this->current_mj_gnos;
8315 mj_timer_base_string +
"Total");
8316 this->mj_env->debug(3,
"Out of MultiJagged");
8319template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8320 typename mj_part_t,
typename mj_node_t>
8321RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8326 if(this->mj_keep_part_boxes) {
8327 return this->kept_boxes;
8330 throw std::logic_error(
"Error: part boxes are not stored.");
8334template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8335 typename mj_part_t,
typename mj_node_t>
8336RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8342 mj_part_t ntasks = this->num_global_parts;
8343 int dim = (*localPartBoxes)[0].getDim();
8344 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
8346 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8348 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
8349 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8351 coord_t *localPartMins = localPartBoundaries;
8352 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8354 coord_t *globalPartMins = globalPartBoundaries;
8355 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8357 mj_part_t boxCount = localPartBoxes->size();
8358 for(mj_part_t i = 0; i < boxCount; ++i) {
8359 mj_part_t pId = (*localPartBoxes)[i].getpId();
8363 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8364 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8366 for(
int j = 0; j < dim; ++j) {
8367 localPartMins[dim * pId + j] = lmins[j];
8368 localPartMaxs[dim * pId + j] = lmaxs[j];
8381 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8382 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8384 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
8385 for(mj_part_t i = 0; i < ntasks; ++i) {
8387 globalPartMins + dim * i,
8388 globalPartMaxs + dim * i);
8401 delete []localPartBoundaries;
8402 delete []globalPartBoundaries;
8409template <
typename Adapter>
8415#ifndef DOXYGEN_SHOULD_SKIP_THIS
8419 typedef typename Adapter::scalar_t adapter_scalar_t;
8422 typedef float default_mj_scalar_t;
8428 (std::is_same<adapter_scalar_t, float>::value ||
8429 std::is_same<adapter_scalar_t, double>::value),
8430 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8432 typedef typename Adapter::gno_t mj_gno_t;
8433 typedef typename Adapter::lno_t mj_lno_t;
8434 typedef typename Adapter::part_t mj_part_t;
8435 typedef typename Adapter::node_t mj_node_t;
8437 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8438 typedef typename mj_node_t::device_type device_t;
8443 RCP<const Environment> mj_env;
8444 RCP<const Comm<int> > mj_problemComm;
8445 RCP<const typename Adapter::base_adapter_t> mj_adapter;
8448 double imbalance_tolerance;
8452 size_t num_global_parts;
8455 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8458 int recursion_depth;
8461 mj_lno_t num_local_coords;
8462 mj_gno_t num_global_coords;
8465 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8469 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8472 int num_weights_per_coord;
8475 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8478 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8481 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8491 mj_part_t num_first_level_parts;
8495 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8499 bool distribute_points_on_cut_lines;
8502 mj_part_t max_concurrent_part_calculation;
8505 int check_migrate_avoid_migration_option;
8513 double minimum_migration_imbalance;
8514 bool mj_keep_part_boxes;
8518 int mj_premigration_option;
8519 int min_coord_per_rank_for_premigration;
8522 ArrayRCP<mj_part_t> comXAdj_;
8525 ArrayRCP<mj_part_t> comAdj_;
8530 void set_input_parameters(
const Teuchos::ParameterList &p);
8532 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
8534 bool mj_premigrate_to_subset(
8536 int migration_selection_option,
8537 RCP<const Environment> mj_env_,
8538 RCP<
const Comm<int> > mj_problemComm_,
8540 mj_lno_t num_local_coords_,
8541 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8542 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8544 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8546 int num_weights_per_coord_,
8547 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8549 RCP<
const Comm<int> > &result_problemComm_,
8550 mj_lno_t & result_num_local_coords_,
8551 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8553 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8554 result_mj_coordinates_,
8555 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8556 int * &result_actual_owner_rank_);
8561 RCP<
const Comm<int> > &problemComm,
8562 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8565 mj_problemComm(problemComm),
8566 mj_adapter(adapter),
8567 imbalance_tolerance(0),
8569 num_global_parts(1),
8572 num_local_coords(0),
8573 num_global_coords(0),
8574 num_weights_per_coord(0),
8575 num_first_level_parts(1),
8576 distribute_points_on_cut_lines(true),
8577 max_concurrent_part_calculation(1),
8578 check_migrate_avoid_migration_option(0),
8580 minimum_migration_imbalance(0.30),
8581 mj_keep_part_boxes(false),
8582 mj_run_as_rcb(false),
8583 mj_premigration_option(0),
8584 min_coord_per_rank_for_premigration(32000),
8598 const bool bUnsorted =
true;
8599 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8601 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
8602 "algorithm. As many as the dimension count.", mj_parts_Validator);
8604 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
8605 "coordinates will be calculated concurently.",
8606 Environment::getAnyIntValidator());
8608 pl.set(
"mj_minimum_migration_imbalance", 1.1,
8609 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8610 "processors to avoid migration",
8611 Environment::getAnyDoubleValidator());
8613 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8614 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8615 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
8616 "depending on the imbalance, 1 for forcing migration, 2 for "
8617 "avoiding migration", mj_migration_option_validator);
8619 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8620 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8621 pl.set(
"mj_migration_type", 0,
8622 "Migration type, 0 for migration to minimize the imbalance "
8623 "1 for migration to minimize messages exchanged the migration.",
8624 mj_migration_option_validator);
8627 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
8628 "geometric partitioning.", Environment::getBoolValidator());
8631 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
8632 Environment::getBoolValidator());
8634 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
8635 "greater than 0.", Environment::getAnyIntValidator());
8637 RCP<Teuchos::EnhancedNumberValidator<int>>
8638 mj_num_teams_validator =
8639 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(
8640 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8641 pl.set(
"mj_num_teams", 0,
8642 "How many teams for the main kernel loop"
8643 , mj_num_teams_validator);
8645 RCP<Teuchos::EnhancedNumberValidator<int>>
8646 mj_premigration_option_validator =
8647 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8649 pl.set(
"mj_premigration_option", 0,
8650 "Whether to do premigration or not. 0 for no migration "
8651 "x > 0 for migration to consecutive processors, "
8652 "the subset will be 0,x,2x,3x,...subset ranks."
8653 , mj_premigration_option_validator);
8655 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to "
8656 "assign each rank in multijagged after premigration"
8657 , Environment::getAnyIntValidator());
8669 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8673 mj_part_t pointAssign(
int dim, adapter_scalar_t *point)
const;
8675 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8676 size_t &nPartsFound, mj_part_t **partsFound)
const;
8680 void getCommunicationGraph(
8682 ArrayRCP<mj_part_t> &comXAdj,
8683 ArrayRCP<mj_part_t> &comAdj);
8685 void set_up_partitioning_data(
8689 std::string timer_base_string;
8697 template<
class dst_t,
class src_t>
8698 typename std::enable_if<std::is_same<
typename dst_t::value_type,
8699 typename src_t::value_type>::value>::type
8700 assign_if_same(dst_t & dst,
const src_t & src) {
8703 template<
class dst_t,
class src_t>
8704 typename std::enable_if<!std::is_same<
typename dst_t::value_type,
8705 typename src_t::value_type>::value>::type
8706 assign_if_same(dst_t & dst,
const src_t & src) {
8711template <
typename Adapter>
8712bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8714 int migration_selection_option,
8715 RCP<const Environment> mj_env_,
8716 RCP<
const Comm<int> > mj_problemComm_,
8718 mj_lno_t num_local_coords_,
8719 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8720 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8722 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8723 int num_weights_per_coord_,
8724 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8726 RCP<
const Comm<int> > & result_problemComm_,
8727 mj_lno_t &result_num_local_coords_,
8728 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8730 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8731 result_mj_coordinates_,
8732 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8733 int * &result_actual_owner_rank_)
8735 mj_env_->timerStart(MACRO_TIMERS,
8736 timer_base_string +
"PreMigration DistributorPlanCreating");
8738 int myRank = mj_problemComm_->getRank();
8739 int worldSize = mj_problemComm_->getSize();
8741 mj_part_t groupsize = worldSize / used_num_ranks;
8743 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8745 mj_part_t i_am_sending_to = 0;
8746 bool am_i_a_receiver =
false;
8748 for(
int i = 0; i < used_num_ranks; ++i) {
8749 group_begins[i+ 1] = group_begins[i] + groupsize;
8750 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8751 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8752 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8753 i_am_sending_to = group_begins[i];
8755 if(myRank == group_begins[i]) {
8756 am_i_a_receiver =
true;
8760 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8761 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8763 Tpetra::Distributor distributor(mj_problemComm_);
8765 std::vector<mj_part_t>
8766 coordinate_destinations(num_local_coords_, i_am_sending_to);
8768 ArrayView<const mj_part_t>
8769 destinations(&(coordinate_destinations[0]), num_local_coords_);
8770 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8771 result_num_local_coords_ = num_incoming_gnos;
8772 mj_env_->timerStop(MACRO_TIMERS,
8773 timer_base_string +
"PreMigration DistributorPlanCreating");
8775 mj_env_->timerStart(MACRO_TIMERS,
8776 timer_base_string +
"PreMigration DistributorMigration");
8784 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8785 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
8786 initial_mj_gnos_.size());
8787 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8789 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8790 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
8793 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8795 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8796 Kokkos::ViewAllocateWithoutInitializing(
"result_initial_mj_gnos_"),
8798 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8804 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8805 host_src_coordinates(
8806 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8807 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8809 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8811 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8812 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8813 num_incoming_gnos, this->coord_dim);
8815 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8816 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
8819 for(
int i = 0; i < this->coord_dim; ++i) {
8821 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8823 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8825 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8829 result_mj_coordinates_ = dst_coordinates;
8833 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8834 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
8835 num_incoming_gnos, this->num_weights_per_coord);
8836 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8838 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8839 Kokkos::HostSpace(), this->mj_weights);
8842 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8843 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
8844 this->num_local_coords);
8846 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8847 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
8850 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
8852 auto sub_host_src_weights
8853 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8854 auto sub_host_dst_weights
8855 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8863 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8864 sent_weight[n] = sub_host_src_weights(n);
8867 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8870 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8871 sub_host_dst_weights(n) = received_weight[n];
8874 Kokkos::deep_copy(dst_weights, host_dst_weights);
8875 result_mj_weights_ = dst_weights;
8879 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8880 Kokkos::ViewAllocateWithoutInitializing(
"sent_owners"),
8882 Kokkos::deep_copy(sent_owners, myRank);
8884 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8885 Kokkos::ViewAllocateWithoutInitializing(
"received_owners"),
8888 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8890 result_actual_owner_rank_ =
new int[num_incoming_gnos];
8892 result_actual_owner_rank_,
8893 received_owners.data(),
8894 num_incoming_gnos *
sizeof(
int));
8897 mj_env_->timerStop(MACRO_TIMERS,
8898 timer_base_string +
"PreMigration DistributorMigration");
8899 return am_i_a_receiver;
8909template <
typename Adapter>
8918 int execute_counter =
8919 Zoltan2_AlgMJ_TrackCallsCounter::get_counter_Zoltan2_AlgMJ();
8920 timer_base_string =
"partition(" + std::to_string(execute_counter) +
") - ";
8922 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"all");
8924 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"setup");
8926 this->set_up_partitioning_data(solution);
8928 this->set_input_parameters(this->mj_env->getParameters());
8929 if(this->mj_keep_part_boxes) {
8930 this->mj_partitioner.set_to_keep_part_boxes();
8933 this->mj_partitioner.set_partitioning_parameters(
8934 this->distribute_points_on_cut_lines,
8935 this->max_concurrent_part_calculation,
8936 this->check_migrate_avoid_migration_option,
8937 this->minimum_migration_imbalance, this->migration_type);
8939 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8940 mj_lno_t result_num_local_coords = this->num_local_coords;
8941 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8943 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8944 result_mj_coordinates = this->mj_coordinates;
8945 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8947 int *result_actual_owner_rank = NULL;
8949 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8950 this->initial_mj_gnos;
8968 int current_world_size = this->mj_problemComm->getSize();
8969 mj_lno_t threshold_num_local_coords =
8970 this->min_coord_per_rank_for_premigration;
8971 bool is_pre_migrated =
false;
8972 bool am_i_in_subset =
true;
8978 if(mj_premigration_option > 0 &&
8979 size_t (current_world_size) > this->num_global_parts &&
8980 this->num_global_coords < mj_gno_t (
8981 current_world_size * threshold_num_local_coords))
8983 if(this->mj_keep_part_boxes) {
8984 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and "
8985 "mj_premigration_option are not supported together yet.");
8988 is_pre_migrated =
true;
8989 int migration_selection_option = mj_premigration_option;
8990 if(migration_selection_option * this->num_global_parts >
8991 (
size_t) (current_world_size)) {
8992 migration_selection_option =
8993 current_world_size / this->num_global_parts;
8996 int used_num_ranks = int (this->num_global_coords /
8997 float (threshold_num_local_coords) + 0.5);
8999 if(used_num_ranks == 0) {
9003 am_i_in_subset = this->mj_premigrate_to_subset(
9005 migration_selection_option,
9007 this->mj_problemComm,
9009 this->num_local_coords,
9010 this->num_global_coords,
9011 this->num_global_parts,
9012 this->initial_mj_gnos,
9013 this->mj_coordinates,
9014 this->num_weights_per_coord,
9018 result_num_local_coords,
9019 result_initial_mj_gnos,
9020 result_mj_coordinates,
9022 result_actual_owner_rank);
9024 result_initial_mj_gnos_ = result_initial_mj_gnos;
9027 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9028 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9030 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"setup");
9032 if(am_i_in_subset) {
9033 this->mj_partitioner.multi_jagged_part(
9036 this->imbalance_tolerance,
9038 this->num_global_parts,
9039 this->part_no_array,
9040 this->recursion_depth,
9042 result_num_local_coords,
9043 this->num_global_coords,
9044 result_initial_mj_gnos_,
9045 result_mj_coordinates,
9046 this->num_weights_per_coord,
9047 this->mj_uniform_weights,
9049 this->mj_uniform_parts,
9050 result_assigned_part_ids,
9055 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"cleanup");
9058 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9059 localGidToLid.reserve(result_num_local_coords);
9060 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9061 Kokkos::ViewAllocateWithoutInitializing(
"host_result_initial_mj_gnos"),
9062 result_initial_mj_gnos_.size());
9063 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9064 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9065 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9068 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
9069 0, result_num_local_coords,
true);
9070 auto host_result_assigned_part_ids =
9071 Kokkos::create_mirror_view(result_assigned_part_ids);
9072 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9073 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9074 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9075 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9076 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9077 partId[origLID] = host_result_assigned_part_ids(i);
9082 if(is_pre_migrated) {
9083 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
9084 "PostMigration DistributorPlanCreating");
9085 Tpetra::Distributor distributor(this->mj_problemComm);
9087 ArrayView<const mj_part_t> actual_owner_destinations(
9088 result_actual_owner_rank , result_num_local_coords);
9090 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9091 actual_owner_destinations);
9093 if(num_incoming_gnos != this->num_local_coords) {
9094 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - "
9095 "num incoming is not equal to num local coords");
9099 "PostMigration DistributorPlanCreating");
9101 "PostMigration DistributorMigration");
9103 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9104 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
9106 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9107 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
9110 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9113 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9114 if (partId.size() > 0) {
9115 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9116 partId.getRawPtr(), partId.size());
9118 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9121 partId = arcp(
new mj_part_t[this->num_local_coords],
9122 0, this->num_local_coords,
true);
9125 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9126 localGidToLid2.reserve(this->num_local_coords);
9127 auto host_initial_mj_gnos =
9128 Kokkos::create_mirror_view(this->initial_mj_gnos);
9129 Kokkos::deep_copy(host_initial_mj_gnos,
9130 this->initial_mj_gnos);
9131 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9132 localGidToLid2[host_initial_mj_gnos(i)] = i;
9135 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9136 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9137 partId[origLID] = received_partids[i];
9142 delete [] result_actual_owner_rank;
9145 timer_base_string +
"PostMigration DistributorMigration");
9147 solution->setParts(partId);
9148 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"cleanup");
9151 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"all");
9154 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9159template <
typename Adapter>
9172 int criteria_dim = (this->num_weights_per_coord ?
9173 this->num_weights_per_coord : 1);
9177 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9180 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9181 "uniform parts", criteria_dim);
9182 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9183 "uniform weights", criteria_dim);
9185 Kokkos::View<const mj_gno_t *, device_t> gnos;
9186 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9188 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9191 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9192 Kokkos::View<mj_scalar_t **, device_t> wgts;
9196 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9199 assign_if_same(xyz, xyz_adapter);
9200 assign_if_same(wgts, wgts_adapter);
9205 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9206 (Kokkos::ViewAllocateWithoutInitializing(
9207 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9208 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9209 Kokkos::ViewAllocateWithoutInitializing(
"wgts"),
9210 wgts_adapter.extent(0), wgts_adapter.extent(1));
9212 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9213 Kokkos::parallel_for(
9214 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9215 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (
int i) {
9216 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9217 xyz(i, n) =
static_cast<mj_scalar_t
>(xyz_adapter(i, n));
9220 Kokkos::parallel_for(
9221 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9222 (0, wgts.extent(0)), KOKKOS_LAMBDA (
int i) {
9223 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9224 wgts(i, n) =
static_cast<mj_scalar_t
>(wgts_adapter(i, n));
9230 this->initial_mj_gnos = gnos;
9232 this->mj_coordinates = xyz;
9235 if(this->num_weights_per_coord == 0) {
9236 this->mj_uniform_weights(0) =
true;
9237 Kokkos::resize(this->mj_weights, 0, 0);
9240 this->mj_weights = wgts;
9241 for(
int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9242 this->mj_uniform_weights(wdim) =
false;
9246 for(
int wdim = 0; wdim < criteria_dim; ++wdim) {
9247 if(solution->criteriaHasUniformPartSizes(wdim)) {
9248 this->mj_uniform_parts(wdim) =
true;
9251 printf(
"Error: MJ does not support non uniform target part weights\n");
9260template <
typename Adapter>
9262 const Teuchos::ParameterList &pl)
9264 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
9267 tol = pe->getValue(&tol);
9268 this->imbalance_tolerance = tol - 1.0;
9272 if(this->imbalance_tolerance <= 0) {
9273 this->imbalance_tolerance= 10e-4;
9277 Kokkos::resize(this->part_no_array, 0);
9280 this->recursion_depth = 0;
9282 if(pl.getPtr<
int>(
"mj_num_teams")) {
9283 this->num_teams = pl.get<
int>(
"mj_num_teams");
9286 if(pl.getPtr<Array <mj_part_t> >(
"mj_parts")) {
9287 auto mj_parts = pl.get<Array <mj_part_t> >(
"mj_parts");
9288 int mj_parts_size =
static_cast<int>(mj_parts.size());
9291 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9292 "part_no_array", mj_parts_size);
9293 for(
int i = 0; i < mj_parts_size; ++i) {
9294 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9297 this->recursion_depth = mj_parts_size - 1;
9298 this->mj_env->debug(2,
"mj_parts provided by user");
9302 this->distribute_points_on_cut_lines =
true;
9303 this->max_concurrent_part_calculation = 1;
9305 this->mj_run_as_rcb =
false;
9306 this->mj_premigration_option = 0;
9307 this->min_coord_per_rank_for_premigration = 32000;
9309 int mj_user_recursion_depth = -1;
9310 this->mj_keep_part_boxes =
false;
9311 this->check_migrate_avoid_migration_option = 0;
9312 this->migration_type = 0;
9313 this->minimum_migration_imbalance = 0.35;
9315 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
9318 imb = pe->getValue(&imb);
9319 this->minimum_migration_imbalance = imb - 1.0;
9322 pe = pl.getEntryPtr(
"mj_migration_option");
9324 this->check_migrate_avoid_migration_option =
9325 pe->getValue(&this->check_migrate_avoid_migration_option);
9327 this->check_migrate_avoid_migration_option = 0;
9329 if(this->check_migrate_avoid_migration_option > 1) {
9330 this->check_migrate_avoid_migration_option = -1;
9334 pe = pl.getEntryPtr(
"mj_migration_type");
9336 this->migration_type = pe->getValue(&this->migration_type);
9338 this->migration_type = 0;
9344 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
9346 this->max_concurrent_part_calculation =
9347 pe->getValue(&this->max_concurrent_part_calculation);
9349 this->max_concurrent_part_calculation = 1;
9352 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
9354 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9356 this->mj_keep_part_boxes =
false;
9367 if(this->mj_keep_part_boxes ==
false) {
9368 pe = pl.getEntryPtr(
"mapping_type");
9370 int mapping_type = -1;
9371 mapping_type = pe->getValue(&mapping_type);
9372 if(mapping_type == 0) {
9373 mj_keep_part_boxes =
true;
9379 pe = pl.getEntryPtr(
"mj_enable_rcb");
9381 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9383 this->mj_run_as_rcb =
false;
9386 pe = pl.getEntryPtr(
"mj_premigration_option");
9388 mj_premigration_option = pe->getValue(&mj_premigration_option);
9390 mj_premigration_option = 0;
9393 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
9395 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9397 min_coord_per_rank_for_premigration = 32000;
9400 pe = pl.getEntryPtr(
"mj_recursion_depth");
9402 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9404 mj_user_recursion_depth = -1;
9408 pe = pl.getEntryPtr(
"rectilinear");
9410 val = pe->getValue(&val);
9413 this->distribute_points_on_cut_lines =
false;
9415 this->distribute_points_on_cut_lines =
true;
9418 if(this->mj_run_as_rcb) {
9419 mj_user_recursion_depth =
9420 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9422 if(this->recursion_depth < 1) {
9423 if(mj_user_recursion_depth > 0) {
9424 this->recursion_depth = mj_user_recursion_depth;
9427 this->recursion_depth = this->coord_dim;
9433template <
typename Adapter>
9436 adapter_scalar_t *lower,
9437 adapter_scalar_t *upper,
9438 size_t &nPartsFound,
9439 typename Adapter::part_t **partsFound)
const
9448 if(this->mj_keep_part_boxes) {
9451 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9453 size_t nBoxes = (*partBoxes).size();
9455 throw std::logic_error(
"no part boxes exist");
9459 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9461 if(globalBox->boxesOverlap(dim, lower, upper)) {
9463 std::vector<typename Adapter::part_t> partlist;
9466 for(
size_t i = 0; i < nBoxes; i++) {
9468 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9470 partlist.push_back((*partBoxes)[i].getpId());
9492 *partsFound =
new mj_part_t[nPartsFound];
9493 for(
size_t i = 0; i < nPartsFound; i++)
9494 (*partsFound)[i] = partlist[i];
9506 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
9511template <
typename Adapter>
9514 adapter_scalar_t *point)
const
9520 if(this->mj_keep_part_boxes) {
9521 typename Adapter::part_t foundPart = -1;
9524 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9526 size_t nBoxes = (*partBoxes).size();
9528 throw std::logic_error(
"no part boxes exist");
9532 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9534 if(globalBox->pointInBox(dim, point)) {
9538 for(i = 0; i < nBoxes; i++) {
9540 if((*partBoxes)[i].pointInBox(dim, point)) {
9541 foundPart = (*partBoxes)[i].getpId();
9555 std::ostringstream oss;
9557 for(
int j = 0; j < dim; j++) oss << point[j] <<
" ";
9558 oss <<
") not found in domain";
9559 throw std::logic_error(oss.str());
9569 size_t closestBox = 0;
9570 coord_t minDistance = std::numeric_limits<coord_t>::max();
9571 coord_t *centroid =
new coord_t[dim];
9572 for(
size_t i = 0; i < nBoxes; i++) {
9573 (*partBoxes)[i].computeCentroid(centroid);
9576 for(
int j = 0; j < dim; j++) {
9577 diff = centroid[j] - point[j];
9580 if(sum < minDistance) {
9585 foundPart = (*partBoxes)[closestBox].getpId();
9592 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
9596template <
typename Adapter>
9602 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9603 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9604 mj_part_t ntasks = (*pBoxes).size();
9605 int dim = (*pBoxes)[0].getDim();
9606 GridHash grid(pBoxes, ntasks, dim);
9613template <
typename Adapter>
9614RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9617 return this->mj_partitioner.get_kept_boxes();
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
int getCoordinateDim() const
Returns the dimension of the coordinates.
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
int value_count_rightleft
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
ArrayCombinationReducer reducer
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
part_t concurrent_current_part
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
part_t concurrent_current_part
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
part_t current_concurrent_num_parts
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
int value_count_rightleft
static int get_counter_Zoltan2_AlgMJ()
static int get_counter_AlgMJ()
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.