11 #include <cusp/csr_matrix.h>
12 #include <cusp/coo_matrix.h>
13 #include <cusp/blas.h>
14 #include <cusp/print.h>
16 #include <thrust/scan.h>
17 #include <thrust/functional.h>
18 #include <thrust/sequence.h>
19 #include <thrust/iterator/zip_iterator.h>
20 #include <thrust/gather.h>
74 EdgeT(
int ori_idx,
int from,
int to, T val)
93 typedef typename cusp::coo_matrix<int, T, cusp::host_memory>
MatrixCoo;
94 typedef typename cusp::array1d<T, cusp::host_memory>
Vector;
95 typedef typename cusp::array1d<double, cusp::host_memory>
DoubleVector;
96 typedef typename cusp::array1d<int, cusp::host_memory>
IntVector;
97 typedef typename cusp::array1d<bool, cusp::host_memory>
BoolVector;
109 Graph(
bool trackReordering =
false);
173 bool m_trackReordering;
177 double m_timeDropoff;
181 bool MC64(
bool scale,
201 std::vector<int>* in_out_graph);
203 static const double LOC_INFINITY;
206 void find_minimum_match(
IntVector& mc64RowPerm,
209 void init_reduced_cval(
IntVector& row_ptr,
218 bool find_shortest_aug_path(
int init_node,
232 struct CompareEdgeLength
240 struct AccumulateEdgeValue
242 T operator()(T res,
const EdgeT<T>& edge)
244 return res + std::abs(edge.m_val);
248 struct EdgeLength :
public thrust::unary_function<EdgeT<T>, int>
251 int operator() (
const EdgeT<T>& a)
253 return std::abs(a.m_from - a.m_to);
257 struct PermutedEdgeLength :
public thrust::unary_function<EdgeT<T>, int>
261 PermutedEdgeLength(
int* perm): m_perm(perm) {}
264 int operator() (
const EdgeT<T>& a)
266 return std::abs(m_perm[a.m_from] - m_perm[a.m_to]);
274 PermuteEdge(
int* perm): m_perm(perm) {}
277 void operator() (EdgeT<T>& a)
279 a.m_from = m_perm[a.m_from];
280 a.m_to = m_perm[a.m_to];
284 struct Exponential:
public thrust::unary_function<double, double>
287 double operator() (
double a)
296 template <
typename T>
297 const double Graph<T>::LOC_INFINITY = 1e37;
305 template <
typename T>
310 m_trackReordering(trackReordering)
322 template <
typename T>
336 m_nnz = Acoo.num_entries;
339 if (m_trackReordering) {
340 for (
int i = 0; i < m_nnz; i++) {
341 m_edges.push_back(
EdgeType(i, Acoo.row_indices[i], Acoo.column_indices[i], (T)Acoo.values[i]));
344 for (
int i = 0; i < m_nnz; i++)
345 m_edges.push_back(
EdgeType(Acoo.row_indices[i], Acoo.column_indices[i], (T)Acoo.values[i]));
356 MC64(scale, mc64RowPerm, mc64RowScaleD, mc64ColScaleD, scaleMap);
357 mc64RowScale = mc64RowScaleD;
358 mc64ColScale = mc64ColScaleD;
360 mc64RowScale.resize(m_n);
361 mc64ColScale.resize(m_n);
362 mc64RowPerm.resize(m_n);
363 scaleMap.resize(m_nnz);
365 thrust::sequence(mc64RowPerm.begin(), mc64RowPerm.end());
366 cusp::blas::fill(mc64RowScale, (T) 1.0);
367 cusp::blas::fill(mc64ColScale, (T) 1.0);
368 cusp::blas::fill(scaleMap, (T) 1.0);
372 for (
EdgeIterator edgeIt = m_edges.begin(); edgeIt != m_edges.end(); edgeIt++)
373 if (k_mc64 < abs(edgeIt->m_from - edgeIt->m_to))
374 k_mc64 = abs(edgeIt->m_from - edgeIt->m_to);
379 int bandwidth = RCM(m_edges, optReordering, optPerm);
382 m_first = m_edges.begin();
406 template <
typename T>
417 std::sort(m_edges.begin(), m_edges.end(), CompareEdgeLength());
422 T norm_in = std::accumulate(m_edges.begin(), m_edges.end(), (T) 0, AccumulateEdgeValue());
423 T min_norm_out = (1 - frac) * norm_in;
424 T norm_out = norm_in;
430 m_first = m_edges.begin();
436 int bandwidth = abs(m_first->m_from - m_first->m_to);
443 do {last++;}
while(abs(last->m_from - last->m_to) == bandwidth);
445 T band_norm = std::accumulate(m_first, last, (T) 0, AccumulateEdgeValue());
449 if (bandwidth <= maxBandwidth && norm_out - band_norm < min_norm_out)
453 norm_out -= band_norm;
462 frac_actual = 1 - norm_out/norm_in;
474 template <
typename T>
488 if (WV_host.size() != 2*bandwidth*bandwidth*(numPartitions-1)) {
489 WV_host.resize(2*bandwidth*bandwidth*(numPartitions-1), 0);
490 offDiags_host.resize(2*bandwidth*bandwidth*(numPartitions-1), 0);
491 offDiagWidths_left.resize(numPartitions-1, 0);
492 offDiagWidths_right.resize(numPartitions-1, 0);
495 cusp::blas::fill(WV_host, (T) 0);
496 cusp::blas::fill(offDiags_host, (T) 0);
497 cusp::blas::fill(offDiagWidths_left, 0);
498 cusp::blas::fill(offDiagWidths_right, 0);
501 offDiagPerms_left.resize((numPartitions-1) * bandwidth, -1);
502 offDiagPerms_right.resize((numPartitions-1) * bandwidth, -1);
504 IntVector offDiagReorderings_left((numPartitions-1) * bandwidth, -1);
505 IntVector offDiagReorderings_right((numPartitions-1) * bandwidth, -1);
509 int partSize = m_n / numPartitions;
510 int remainder = m_n % numPartitions;
512 m_major_edges.clear();
514 if (m_trackReordering) {
515 typeMap.resize(m_nnz);
516 offDiagMap.resize(m_nnz);
520 m_exists.resize(m_n);
521 cusp::blas::fill(m_exists,
false);
523 for (
EdgeIterator it = first; it != m_edges.end(); ++it) {
526 int curPartNum = l / (partSize + 1);
527 if (curPartNum >= remainder)
528 curPartNum = remainder + (l-remainder * (partSize + 1)) / partSize;
530 int curPartNum2 = j / (partSize + 1);
531 if (curPartNum2 >= remainder)
532 curPartNum2 = remainder + (j-remainder * (partSize + 1)) / partSize;
534 if (curPartNum == curPartNum2)
535 m_major_edges.push_back(*it);
537 if (curPartNum > curPartNum2) {
539 int partEndRow = partSize * curPartNum2;
540 if (curPartNum2 < remainder)
541 partEndRow += curPartNum2 + partSize + 1;
543 partEndRow += remainder + partSize;
545 int partStartCol = partSize * curPartNum;
546 if (curPartNum < remainder)
547 partStartCol += curPartNum;
549 partStartCol += remainder;
551 if (offDiagReorderings_right[curPartNum2*bandwidth+l-partStartCol] < 0) {
552 offDiagReorderings_right[curPartNum2*bandwidth+l-partStartCol] = offDiagWidths_right[curPartNum2];
553 offDiagPerms_right[curPartNum2*bandwidth+offDiagWidths_right[curPartNum2]] = l-partStartCol;
554 offDiagWidths_right[curPartNum2]++;
557 if (m_trackReordering) {
558 typeMap[it->m_ori_idx] = 0;
559 offDiagMap[it->m_ori_idx] = curPartNum2*2*bandwidth*bandwidth + (j+bandwidth-partEndRow) * bandwidth + (l-partStartCol);
560 WVMap[it->m_ori_idx] = curPartNum2*2*bandwidth*bandwidth + (j+bandwidth-partEndRow) + offDiagReorderings_right[curPartNum2*bandwidth+l-partStartCol] * bandwidth;
563 offDiags_host[curPartNum2*2*bandwidth*bandwidth + (j+bandwidth-partEndRow) * bandwidth + (l-partStartCol)] = WV_host[curPartNum2*2*bandwidth*bandwidth + (j+bandwidth-partEndRow) + offDiagReorderings_right[curPartNum2*bandwidth+l-partStartCol] * bandwidth] = it->m_val;
566 int partStartRow = partSize * curPartNum2;
567 if (curPartNum2 < remainder)
568 partStartRow += curPartNum2;
570 partStartRow += remainder;
572 int partEndCol = partSize * curPartNum;
573 if (curPartNum < remainder)
574 partEndCol += curPartNum + partSize + 1;
576 partEndCol += remainder + partSize;
578 if (offDiagReorderings_left[(curPartNum2-1)*bandwidth+l-partEndCol+bandwidth] < 0) {
579 offDiagReorderings_left[(curPartNum2-1)*bandwidth+l-partEndCol+bandwidth] = bandwidth - 1 - offDiagWidths_left[curPartNum2-1];
580 offDiagPerms_left[(curPartNum2-1)*bandwidth+bandwidth-1-offDiagWidths_left[curPartNum2-1]] = l-partEndCol+bandwidth;
581 offDiagWidths_left[curPartNum2-1]++;
584 if (m_trackReordering) {
585 typeMap[it->m_ori_idx] = 0;
586 offDiagMap[it->m_ori_idx] = (curPartNum*2+1)*bandwidth*bandwidth + (j-partStartRow) * bandwidth + (l-partEndCol+bandwidth);
587 WVMap[it->m_ori_idx] = (curPartNum*2+1)*bandwidth*bandwidth + (j-partStartRow) + (offDiagReorderings_left[(curPartNum2-1)*bandwidth+l-partEndCol+bandwidth]) * bandwidth;
590 offDiags_host[(curPartNum*2+1)*bandwidth*bandwidth + (j-partStartRow) * bandwidth + (l-partEndCol+bandwidth)] = WV_host[(curPartNum*2+1)*bandwidth*bandwidth + (j-partStartRow) + (offDiagReorderings_left[(curPartNum2-1)*bandwidth+l-partEndCol+bandwidth]) * bandwidth] = it->m_val;
604 template <
typename T>
613 secondPerm.resize(m_n);
614 cusp::blas::fill(secondPerm, 0);
615 for (
EdgeIterator edgeIt = m_major_edges.begin(); edgeIt != m_major_edges.end(); edgeIt++)
616 secondPerm[edgeIt -> m_to]++;
617 thrust::inclusive_scan(secondPerm.begin(), secondPerm.end(), secondPerm.begin());
619 for (
EdgeRevIterator edgeIt = m_major_edges.rbegin(); edgeIt != m_major_edges.rend(); edgeIt++)
620 tmp_edges[--secondPerm[edgeIt->m_to]] = *edgeIt;
621 m_major_edges = tmp_edges;
624 int node_begin = 0, node_end;
625 int partSize = m_n / numPartitions;
626 int remainder = m_n % numPartitions;
627 EdgeIterator edgeBegin = m_major_edges.begin(), edgeEnd;
628 secondReorder.resize(m_n);
630 for (
int i = 0; i < numPartitions; i++) {
632 node_end = node_begin + partSize + 1;
634 node_end = node_begin + partSize;
636 for (edgeEnd = edgeBegin; edgeEnd != m_major_edges.end(); edgeEnd++) {
637 if (edgeEnd->m_from >= node_end)
641 partitionedRCM(edgeBegin,
648 node_begin = node_end;
652 first_rows.resize(numPartitions - 1);
655 for (
int i=0; i<numPartitions - 1; i++) {
657 last_row += (partSize + 1);
659 last_row += partSize;
661 first_rows[i] = last_row;
662 for (
int j = last_row-bandwidth; j < last_row; j++)
663 if (m_exists[j] && first_rows[i] > secondPerm[j])
664 first_rows[i] = secondPerm[j];
677 template <
typename T>
688 size_t Bsize = (size_t) (2 * bandwidth + 1) * m_n;
691 ks_col.resize(m_n, 0);
692 ks_row.resize(m_n, 0);
694 if (m_trackReordering) {
695 if (typeMap.size() <= 0)
696 typeMap.resize(m_nnz);
697 bandedMatMap.resize(m_nnz);
700 if (m_trackReordering) {
701 for (
EdgeIterator it = m_first; it != m_edges.end(); ++it) {
705 size_t i = (size_t) l * (2 * bandwidth + 1) + bandwidth + j - l;
707 typeMap[it->m_ori_idx] = 1;
708 bandedMatMap[it->m_ori_idx] = i;
710 if (ks_col[l] < j - l)
716 for (
EdgeIterator it = m_first; it != m_edges.end(); ++it) {
720 size_t i = (size_t) l * (2 * bandwidth + 1) + bandwidth + j - l;
723 if (ks_col[l] < j - l)
730 for (
int i=1; i<m_n; i++) {
731 if (ks_col[i] < ks_col[i-1] - 1)
732 ks_col[i] = ks_col[i-1] - 1;
733 if (ks_row[i] < ks_row[i-1] - 1)
734 ks_row[i] = ks_row[i-1] - 1;
738 template <
typename T>
750 ks.resize(numPartitions, 0);
751 BOffsets.resize(numPartitions + 1);
755 int partSize = m_n / numPartitions;
756 int remainder = m_n % numPartitions;
758 EdgeIterator toStart = m_major_edges.begin(), toEnd = m_major_edges.end();
763 int curPartNum = l / (partSize + 1);
764 if (curPartNum >= remainder)
765 curPartNum = remainder + (l-remainder * (partSize + 1)) / partSize;
766 if (ks[curPartNum] < abs(l-j))
767 ks[curPartNum] = abs(l-j);
770 for (
int i=0; i < numPartitions; i++) {
772 BOffsets[i+1] = BOffsets[i] + (partSize + 1) * (2 * ks[i] + 1);
774 BOffsets[i+1] = BOffsets[i] + (partSize) * (2 * ks[i] + 1);
777 B.resize(BOffsets[numPartitions]);
779 if (m_trackReordering) {
780 if (typeMap.size() <= 0)
781 typeMap.resize(m_nnz);
782 bandedMatMap.resize(m_nnz);
785 ks_col.resize(m_n, 0);
786 ks_row.resize(m_n, 0);
792 int curPartNum = l / (partSize + 1);
794 if (curPartNum >= remainder) {
795 l_in_part = l - remainder * (partSize + 1);
796 curPartNum = remainder + l_in_part / partSize;
797 l_in_part %= partSize;
799 l_in_part = l % (partSize + 1);
802 int K = ks[curPartNum];
803 int i = BOffsets[curPartNum] + l_in_part * (2 * K + 1) + K + j - l;
806 if (ks_col[l] < j - l)
811 if (m_trackReordering) {
812 typeMap[it->m_ori_idx] = 1;
813 bandedMatMap[it->m_ori_idx] = i;
817 int partBegin = 0, partEnd = partSize;
818 for (
int i=0; i<numPartitions; i++) {
821 for (
int j = partBegin+1; j < partEnd; j++) {
822 if (ks_col[j] < ks_col[j-1] - 1)
823 ks_col[j] = ks_col[j-1] - 1;
824 if (ks_col[j] > partEnd - j - 1)
825 ks_col[j] = partEnd - j - 1;
827 if (ks_row[j] < ks_row[j-1] - 1)
828 ks_row[j] = ks_row[j-1] - 1;
829 if (ks_row[j] > partEnd - j - 1)
830 ks_row[j] = partEnd - j - 1;
833 partEnd = partBegin + partSize;
843 template <
typename T>
846 IntVector& mc64RowPerm,
847 DoubleVector& mc64RowScale,
848 DoubleVector& mc64ColScale,
849 MatrixMapF& scaleMap)
851 find_minimum_match(mc64RowPerm, mc64RowScale, mc64ColScale);
853 if (m_trackReordering)
854 scaleMap.resize(m_nnz);
857 for (EdgeIterator iter = m_edges.begin(); iter != m_edges.end(); iter++) {
858 int from = iter->m_from;
860 T scaleFact = (T)(mc64RowScale[from] * mc64ColScale[to]);
861 if (m_trackReordering)
862 scaleMap[iter->m_ori_idx] = scaleFact;
863 iter->m_val *= scaleFact;
864 iter->m_from = mc64RowPerm[from];
867 for(EdgeIterator iter = m_edges.begin(); iter != m_edges.end(); iter++)
868 iter->m_from = mc64RowPerm[iter->m_from];
870 if (m_trackReordering)
871 cusp::blas::fill(scaleMap, (T) 1.0);
885 template <
typename T>
887 Graph<T>::RCM(EdgeVector& edges,
888 IntVector& optReordering,
891 optReordering.resize(m_n);
894 int nnz = edges.size();
896 IntVector tmp_reordering(m_n);
897 IntVector degrees(m_n, 0);
899 thrust::sequence(optReordering.begin(), optReordering.end());
901 std::vector<int> *in_out_graph;
903 in_out_graph =
new std::vector<int> [m_n];
905 EdgeIterator begin = edges.begin();
906 EdgeIterator end = edges.end();
907 int tmp_bdwidth = thrust::transform_reduce(begin, end, EdgeLength(), 0, thrust::maximum<int>());
908 int bandwidth = tmp_bdwidth;
909 buildTopology(begin, end, degrees, in_out_graph);
911 const int MAX_NUM_TRIAL = 10;
912 const int BANDWIDTH_THRESHOLD = 256;
913 const int BANDWIDTH_MIN_REQUIRED = 10000;
918 BoolVector tried(m_n,
false);
923 for (
int trial_num = 0; trial_num < MAX_NUM_TRIAL || (bandwidth >= BANDWIDTH_MIN_REQUIRED && trial_num < 10*MAX_NUM_TRIAL); trial_num++)
926 std::priority_queue<NodeType> pq;
929 BoolVector pushed(m_n,
false);
936 if (trial_num < MAX_NUM_TRIAL) {
937 tmp_node = rand() % m_n;
939 while(tried[tmp_node])
940 tmp_node = rand() % m_n;
942 if (last_tried >= m_n - 1) {
943 fprintf(stderr,
"All possible starting points have been tried in RCM\n");
946 for (tmp_node = last_tried+1; tmp_node < m_n; tmp_node++)
947 if (!tried[tmp_node]) {
948 last_tried = tmp_node;
953 pushed[tmp_node] =
true;
954 tried[tmp_node] =
true;
963 for(i = last; i < m_n; i++) {
971 if(i < m_n)
continue;
972 fprintf(stderr,
"Can never get here!\n");
976 tmp_node = q.front();
977 tmp_reordering[j] = tmp_node;
982 std::vector<int> &tmp_vec = in_out_graph[tmp_node];
983 int out_size = tmp_vec.size();
985 for (
int i = 0; i < out_size; i++) {
986 int target_node = tmp_vec[i];
987 if(!pushed[target_node]) {
988 pushed[target_node] =
true;
989 pq.push(NodeType(target_node, degrees[target_node]));
995 q.push(pq.top().m_idx);
1000 thrust::scatter(thrust::make_counting_iterator(0),
1001 thrust::make_counting_iterator(m_n),
1002 tmp_reordering.begin(),
1006 int *perm_array = thrust::raw_pointer_cast(&optPerm[0]);
1007 tmp_bdwidth = thrust::transform_reduce(edges.begin(), edges.end(), PermutedEdgeLength(perm_array), 0, thrust::maximum<int>());
1010 if(bandwidth > tmp_bdwidth) {
1011 bandwidth = tmp_bdwidth;
1012 optReordering = tmp_reordering;
1015 if(bandwidth <= BANDWIDTH_THRESHOLD)
1021 m_timeRCM = timer.getElapsed();
1023 thrust::scatter(thrust::make_counting_iterator(0),
1024 thrust::make_counting_iterator(m_n),
1025 optReordering.begin(),
1029 int* perm_array = thrust::raw_pointer_cast(&optPerm[0]);
1030 thrust::for_each(edges.begin(), edges.end(), PermuteEdge(perm_array));
1033 delete [] in_out_graph;
1045 template <
typename T>
1047 Graph<T>::partitionedRCM(EdgeIterator& begin,
1051 IntVector& optReordering,
1054 static std::vector<int> tmp_reordering;
1055 tmp_reordering.resize(m_n);
1057 static IntVector degrees;
1058 if (degrees.size() != m_n)
1059 degrees.resize(m_n, 0);
1061 cusp::blas::fill(degrees, 0);
1065 thrust::sequence(optReordering.begin()+node_begin, optReordering.begin()+node_end, node_begin);
1067 std::vector<int>* in_out_graph;
1069 in_out_graph =
new std::vector<int> [node_end];
1071 int tmp_bdwidth = thrust::transform_reduce(begin, end, EdgeLength(), 0, thrust::maximum<int>());
1072 int opt_bdwidth = tmp_bdwidth;
1073 buildTopology(begin, end, degrees, in_out_graph);
1075 const int MAX_NUM_TRIAL = 10;
1076 const int BANDWIDTH_THRESHOLD = 128;
1078 static BoolVector tried(m_n,
false);
1079 if (tried.size() != m_n)
1080 tried.resize(m_n,
false);
1082 cusp::blas::fill(tried,
false);
1084 tried[node_begin] =
true;
1089 for (
int num_trial = 0; num_trial < MAX_NUM_TRIAL; num_trial++) {
1091 std::priority_queue<NodeType> pq;
1094 BoolVector pushed(node_end,
false);
1096 if (num_trial > 0) {
1097 tmp_node = rand() % (node_end - node_begin) + node_begin;
1098 while (tried[tmp_node])
1099 tmp_node = rand() % (node_end - node_begin) + node_begin;
1100 tried[tmp_node] = pushed[tmp_node] =
true;
1104 int left_cnt = node_end - node_begin;
1105 int j = node_begin, last = node_begin;
1111 for(i = last; i < node_end; i++) {
1119 if(i < node_end)
continue;
1120 fprintf(stderr,
"Can never get here!\n");
1124 tmp_node = q.front();
1125 tmp_reordering[j] = tmp_node;
1130 std::vector<int>& tmp_vec = in_out_graph[tmp_node];
1131 int in_out_size = tmp_vec.size();
1132 if(in_out_size != 0) {
1133 for (
int i = 0; i < in_out_size; i++) {
1134 int target_node = tmp_vec[i];
1135 if(!pushed[target_node]) {
1136 pushed[target_node] =
true;
1137 pq.push(NodeType(target_node, degrees[target_node]));
1142 while(!pq.empty()) {
1143 q.push(pq.top().m_idx);
1148 thrust::scatter(thrust::make_counting_iterator(node_begin),
1149 thrust::make_counting_iterator(node_end),
1150 tmp_reordering.begin() + node_begin,
1154 int *perm_array = thrust::raw_pointer_cast(&optPerm[0]);
1155 tmp_bdwidth = thrust::transform_reduce(begin, end, PermutedEdgeLength(perm_array), 0, thrust::maximum<int>());
1158 if(opt_bdwidth > tmp_bdwidth) {
1159 opt_bdwidth = tmp_bdwidth;
1161 thrust::copy(tmp_reordering.begin()+node_begin, tmp_reordering.begin()+node_end, optReordering.begin()+node_begin);
1164 if(opt_bdwidth <= BANDWIDTH_THRESHOLD)
1169 m_timeRCM += timer.getElapsed();
1171 thrust::scatter(thrust::make_counting_iterator(node_begin),
1172 thrust::make_counting_iterator(node_end),
1173 optReordering.begin() + node_begin,
1177 int* perm_array = thrust::raw_pointer_cast(&optPerm[0]);
1178 thrust::for_each(begin, end, PermuteEdge(perm_array));
1181 delete [] in_out_graph;
1191 template <
typename T>
1193 Graph<T>::buildTopology(EdgeIterator& begin,
1196 std::vector<int>* in_out_graph)
1198 for(EdgeIterator edgeIt = begin; edgeIt != end; edgeIt++) {
1199 int from = edgeIt -> m_from, to = edgeIt -> m_to;
1201 in_out_graph[from].push_back(to);
1202 in_out_graph[to].push_back(from);
1224 template <
typename T>
1226 Graph<T>::find_minimum_match(IntVector& mc64RowPerm,
1227 DoubleVector& mc64RowScale,
1228 DoubleVector& mc64ColScale)
1231 mc64RowPerm.resize(m_n, 0);
1232 mc64RowScale.resize(m_n + 1, 0);
1233 mc64ColScale.resize(m_n + 1, 0);
1236 IntVector row_ptr(m_n + 1, 0);
1237 IntVector rows(m_nnz);
1238 IntVector rev_match_nodes(m_nnz);
1239 DoubleVector c_val(m_nnz);
1240 DoubleVector max_val_in_col(m_n + 1, 0);
1241 IntVector prev(m_n + 1);
1242 IntVector matched(m_n + 1, 0);
1243 IntVector rev_matched(m_n + 1, 0);
1245 get_csc_matrix(row_ptr, rows, c_val, max_val_in_col);
1246 init_reduced_cval(row_ptr, rows, c_val, mc64RowScale, mc64ColScale, mc64RowPerm, rev_match_nodes, matched, rev_matched);
1249 for(
int i=0; i<m_n; i++) {
1250 if(rev_matched[i])
continue;
1251 bool success = find_shortest_aug_path(i, matched, rev_matched, mc64RowPerm, rev_match_nodes, row_ptr, rows, prev, mc64RowScale, mc64ColScale, c_val, irn);
1255 for (
int i=0; i<m_n; i++)
1259 IntVector unmatched_rows;
1260 IntVector unmatched_cols;
1262 for (
int i=0; i<m_n; i++)
1264 unmatched_rows.push_back(i);
1266 for (
int i=0; i<m_n; i++)
1267 if (!rev_matched[i])
1268 unmatched_cols.push_back(i);
1270 int unmatched_count = unmatched_rows.size();
1272 for (
int i=0; i<unmatched_count; i++)
1273 mc64RowPerm[unmatched_rows[i]] = unmatched_cols[i];
1277 mc64RowScale.pop_back();
1278 mc64ColScale.pop_back();
1279 max_val_in_col.pop_back();
1281 thrust::transform(mc64RowScale.begin(), mc64RowScale.end(), mc64RowScale.begin(), Exponential());
1282 thrust::transform(thrust::make_transform_iterator(mc64ColScale.begin(), Exponential()),
1283 thrust::make_transform_iterator(mc64ColScale.end(), Exponential()),
1284 max_val_in_col.begin(),
1285 mc64ColScale.begin(),
1286 thrust::divides<double>());
1296 template<
typename T>
1298 Graph<T>::get_csc_matrix(IntVector& row_ptr,
1300 DoubleVector& c_val,
1301 DoubleVector& max_val_in_col)
1303 BoolVector row_visited(m_n,
false);
1305 cusp::blas::fill(c_val, LOC_INFINITY);
1306 for (EdgeIterator edgeIt = m_edges.begin(); edgeIt != m_edges.end(); edgeIt++) {
1307 row_ptr[edgeIt->m_to]++;
1308 row_visited[edgeIt->m_from] =
true;
1311 thrust::exclusive_scan(row_ptr.begin(), row_ptr.end(), row_ptr.begin());
1313 for (EdgeIterator edgeIt = m_edges.begin(); edgeIt != m_edges.end(); edgeIt++) {
1314 int from = edgeIt->m_from;
1315 int to = edgeIt->m_to;
1316 double tmp_val = fabs(edgeIt->m_val);
1317 rows[row_ptr[to]++] = from;
1318 if (max_val_in_col[to] < tmp_val)
1319 max_val_in_col[to] = tmp_val;
1322 for (EdgeRevIterator edgeIt = m_edges.rbegin(); edgeIt != m_edges.rend(); edgeIt++) {
1323 int to = edgeIt -> m_to;
1324 c_val[--row_ptr[to]] = log(max_val_in_col[to] / fabs(edgeIt -> m_val));
1334 template <
typename T>
1336 Graph<T>::init_reduced_cval(IntVector& row_ptr,
1338 DoubleVector& c_val,
1339 DoubleVector& u_val,
1340 DoubleVector& v_val,
1341 IntVector& match_nodes,
1342 IntVector& rev_match_nodes,
1344 IntVector& rev_matched)
1348 cusp::blas::fill(u_val, LOC_INFINITY);
1349 cusp::blas::fill(v_val, LOC_INFINITY);
1351 for(i = 0; i < m_n; i++) {
1352 int start_idx = row_ptr[i], end_idx = row_ptr[i+1];
1353 for(j = start_idx; j < end_idx; j++) {
1354 if (c_val[j] > LOC_INFINITY / 2.0)
continue;
1356 if(u_val[row] > c_val[j]) {
1357 u_val[row] = c_val[j];
1362 for(i = 0; i < m_n; i++) {
1363 int start_idx = row_ptr[i], end_idx = row_ptr[i+1];
1365 for(j = start_idx; j < end_idx; j++) {
1366 if (c_val[j] > LOC_INFINITY / 2.0)
continue;
1368 double tmp_val = c_val[j] - u_val[row];
1369 if(v_val[i] > tmp_val) {
1375 int tmp_row = rows[min_idx];
1376 if(!matched[tmp_row]) {
1377 rev_matched[i] =
true;
1378 matched[tmp_row] =
true;
1379 match_nodes[tmp_row] = i;
1380 rev_match_nodes[i] = min_idx;
1385 for(i = 0; i < m_n; i++) {
1388 if (!rev_matched[i])
1399 template<
typename T>
1401 Graph<T>::find_shortest_aug_path(
int init_node,
1403 IntVector& rev_matched,
1404 IntVector& match_nodes,
1405 IntVector& rev_match_nodes,
1409 DoubleVector& u_val,
1410 DoubleVector& v_val,
1411 DoubleVector& c_val,
1414 bool success =
false;
1416 static IntVector B(m_n+1, 0);
1418 static BoolVector inB(m_n+1,
false);
1420 std::priority_queue<Dijkstra> Q;
1422 double lsap = LOC_INFINITY;
1423 int cur_node = init_node;
1429 prev[init_node] = -1;
1431 static DoubleVector d_vals(m_n+1, LOC_INFINITY);
1432 static BoolVector visited(m_n+1,
false);
1435 int start_cur = row_ptr[cur_node];
1436 int end_cur = row_ptr[cur_node+1];
1437 for(i = start_cur; i < end_cur; i++) {
1438 int cur_row = rows[i];
1439 if(inB[cur_row])
continue;
1440 if(c_val[i] > LOC_INFINITY / 2.0)
continue;
1441 double reduced_cval = c_val[i] - u_val[cur_row] - v_val[cur_node];
1442 if (reduced_cval + 1e-10 < 0)
1444 double d_new = lsp + reduced_cval;
1446 if(!matched[cur_row]) {
1451 match_nodes[isap] = cur_node;
1452 }
else if (d_new < d_vals[cur_row]){
1453 d_vals[cur_row] = d_new;
1454 prev[match_nodes[cur_row]] = cur_node;
1455 Q.push(Dijkstra(cur_row, d_new));
1467 if(visited[min_d.m_idx])
1475 int tmp_idx = min_d.m_idx;
1476 visited[tmp_idx] =
true;
1480 visited[tmp_idx] =
false;
1481 d_vals[tmp_idx] = LOC_INFINITY;
1484 inB[tmp_idx] =
true;
1485 B[b_cnt++] = tmp_idx;
1487 cur_node = match_nodes[tmp_idx];
1490 if(lsap < LOC_INFINITY / 2.0f) {
1491 matched[isap] =
true;
1492 cur_node = match_nodes[isap];
1494 v_val[cur_node] = c_val[ksap];
1496 while(prev[cur_node] >= 0) {
1497 match_nodes[isap] = cur_node;
1499 int next_ksap = rev_match_nodes[cur_node];
1500 int next_isap = rows[next_ksap];
1501 next_ksap = irn[next_isap];
1503 rev_match_nodes[cur_node] = ksap;
1505 cur_node = prev[cur_node];
1509 match_nodes[isap] = cur_node;
1510 rev_match_nodes[cur_node] = ksap;
1511 rev_matched[cur_node] =
true;
1514 for (i = 0; i < b_cnt; i++) {
1516 int j_val = match_nodes[tmp_row];
1517 int tmp_k = rev_match_nodes[j_val];
1518 u_val[tmp_row] += d_vals[tmp_row] - lsap;
1519 v_val[j_val] = c_val[tmp_k] - u_val[tmp_row];
1520 d_vals[tmp_row] = LOC_INFINITY;
1521 visited[tmp_row] =
false;
1522 inB[tmp_row] =
false;
1526 Dijkstra tmpD = Q.top();
1528 d_vals[tmpD.m_idx] = LOC_INFINITY;
EdgeT< T > EdgeType
Definition: graph.h:102
EdgeT()
Definition: graph.h:71
cusp::array1d< T, cusp::host_memory > Vector
Definition: graph.h:94
void assembleBandedMatrix(int bandwidth, IntVector &ks_col, IntVector &ks_row, Vector &B, MatrixMap &typeMap, MatrixMap &bandedMatMap)
Definition: graph.h:679
int m_ori_idx
Definition: graph.h:85
int m_from
Definition: graph.h:81
EdgeVector::reverse_iterator EdgeRevIterator
Definition: graph.h:107
int m_to
Definition: graph.h:82
int m_idx
Definition: graph.h:62
friend bool operator>(const Node &a, const Node &b)
Definition: graph.h:54
int m_degree
Definition: graph.h:63
IntVector MatrixMap
Definition: graph.h:99
std::vector< EdgeType > EdgeVector
Definition: graph.h:104
double m_val
Definition: graph.h:45
Dijkstra()
Definition: graph.h:33
virtual void Start()
Definition: timer.h:108
CPU and GPU timer classes.
double getTimeMC64() const
Definition: graph.h:111
virtual void Stop()
Definition: timer.h:112
Definition of the Spike system_error exception class.
EdgeVector::iterator EdgeIterator
Definition: graph.h:106
double getTimeRCM() const
Definition: graph.h:112
Vector MatrixMapF
Definition: graph.h:98
EdgeT(int from, int to, T val)
Definition: graph.h:72
friend bool operator<(const Dijkstra &a, const Dijkstra &b)
Definition: graph.h:36
void assembleOffDiagMatrices(int bandwidth, int numPartitions, Vector &WV_host, Vector &offDiags_host, IntVector &offDiagWidths_left, IntVector &offDiagWidths_right, IntVector &offDiagPerms_left, IntVector &offDiagPerms_right, MatrixMap &typeMap, MatrixMap &offDiagMap, MatrixMap &WVMap)
Definition: graph.h:476
cusp::array1d< bool, cusp::host_memory > BoolVector
Definition: graph.h:97
int dropOff(T frac, int maxBandwidth, T &frac_actual)
Definition: graph.h:408
int m_idx
Definition: graph.h:44
friend bool operator<(const Node &a, const Node &b)
Definition: graph.h:58
Graph(bool trackReordering=false)
Definition: graph.h:306
Node(int idx, int degree)
Definition: graph.h:52
cusp::coo_matrix< int, T, cusp::host_memory > MatrixCoo
Definition: graph.h:93
friend bool operator>(const Dijkstra &a, const Dijkstra &b)
Definition: graph.h:40
EdgeT(int ori_idx, int from, int to, T val)
Definition: graph.h:74
int reorder(const MatrixCoo &Acoo, bool doMC64, bool scale, IntVector &optReordering, IntVector &optPerm, IntVector &mc64RowPerm, Vector &mc64RowScale, Vector &mc64ColScale, MatrixMapF &scaleMap, int &k_mc64)
Definition: graph.h:324
friend bool operator<(const EdgeT &a, const EdgeT &b)
Definition: graph.h:77
cusp::array1d< int, cusp::host_memory > IntVector
Definition: graph.h:96
void secondLevelReordering(int bandwidth, int numPartitions, IntVector &secondReorder, IntVector &secondPerm, IntVector &first_rows)
Definition: graph.h:606
cusp::array1d< double, cusp::host_memory > DoubleVector
Definition: graph.h:95
Definition: exception.h:22
virtual double getElapsed()
Definition: timer.h:116
double getTimeDropoff() const
Definition: graph.h:113
Dijkstra(int idx, double val)
Definition: graph.h:34
std::vector< NodeType > NodeVector
Definition: graph.h:103
CPU timer.
Definition: timer.h:101
Node NodeType
Definition: graph.h:101
Definition: exception.h:19
T m_val
Definition: graph.h:83