SpikeGPU  1.0.0
graph.h
1 #ifndef SPIKE_GRAPH_H
2 #define SPIKE_GRAPH_H
3 
4 #include <vector>
5 #include <queue>
6 #include <cmath>
7 #include <iostream>
8 #include <cstdio>
9 #include <algorithm>
10 
11 #include <cusp/csr_matrix.h>
12 #include <cusp/coo_matrix.h>
13 #include <cusp/blas.h>
14 #include <cusp/print.h>
15 
16 #include <thrust/scan.h>
17 #include <thrust/functional.h>
18 #include <thrust/sequence.h>
19 #include <thrust/iterator/zip_iterator.h>
20 #include <thrust/gather.h>
21 
22 #include <spike/common.h>
23 #include <spike/timer.h>
24 
25 #include <spike/exception.h>
26 
27 namespace spike {
28 
29 
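// Note: in the Dijkstra and Node classes below, the comparison operators are
// deliberately inverted (operator< compares with '>') so that
// std::priority_queue, which is a max-heap by default, pops the element with
// the smallest m_val (respectively the smallest m_degree) first, i.e. it
// behaves as a min-heap.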
30 class Dijkstra
31 {
32 public:
33  Dijkstra() {}
34  Dijkstra(int idx, double val) : m_idx(idx), m_val(val) {}
35 
36  friend bool operator<(const Dijkstra& a, const Dijkstra& b) {
37  return a.m_val > b.m_val;
38  }
39 
40  friend bool operator>(const Dijkstra& a, const Dijkstra& b) {
41  return a.m_val < b.m_val;
42  }
43 
44  int m_idx;
45  double m_val;
46 };
47 
48 
49 class Node
50 {
51 public:
52  Node(int idx, int degree) : m_idx(idx), m_degree(degree) {}
53 
54  friend bool operator>(const Node& a, const Node& b) {
55  return a.m_degree < b.m_degree;
56  }
57 
58  friend bool operator<(const Node& a, const Node& b) {
59  return a.m_degree > b.m_degree;
60  }
61 
62  int m_idx;
63  int m_degree;
64 };
65 
66 
67 template <typename T>
68 class EdgeT
69 {
70 public:
71  EdgeT() {}
72  EdgeT(int from, int to, T val) : m_from(from), m_to(to), m_val(val) {}
73 
74  EdgeT(int ori_idx, int from, int to, T val)
75  : m_ori_idx(ori_idx), m_from(from), m_to(to), m_val(val) {}
76 
77  friend bool operator< (const EdgeT& a, const EdgeT& b) {
78  return a.m_to < b.m_to;
79  }
80 
81  int m_from;
82  int m_to;
83  T m_val;
84 
85  int m_ori_idx;
86 };
87 
88 
89 template <typename T>
90 class Graph
91 {
92 public:
93  typedef typename cusp::coo_matrix<int, T, cusp::host_memory> MatrixCoo;
94  typedef typename cusp::array1d<T, cusp::host_memory> Vector;
95  typedef typename cusp::array1d<double, cusp::host_memory> DoubleVector;
96  typedef typename cusp::array1d<int, cusp::host_memory> IntVector;
97  typedef typename cusp::array1d<bool, cusp::host_memory> BoolVector;
98  typedef Vector MatrixMapF;
99  typedef IntVector MatrixMap;
100 
101  typedef Node NodeType;
102  typedef EdgeT<T> EdgeType;
103  typedef std::vector<NodeType> NodeVector;
104  typedef std::vector<EdgeType> EdgeVector;
105 
106  typedef typename EdgeVector::iterator EdgeIterator;
107  typedef typename EdgeVector::reverse_iterator EdgeRevIterator;
108 
109  Graph(bool trackReordering = false);
110 
111  double getTimeMC64() const {return m_timeMC64;}
112  double getTimeRCM() const {return m_timeRCM;}
113  double getTimeDropoff() const {return m_timeDropoff;}
114 
115  int reorder(const MatrixCoo& Acoo,
116  bool doMC64,
117  bool scale,
118  IntVector& optReordering,
119  IntVector& optPerm,
120  IntVector& mc64RowPerm,
121  Vector& mc64RowScale,
122  Vector& mc64ColScale,
123  MatrixMapF& scaleMap,
124  int& k_mc64);
125 
126  int dropOff(T frac,
127  int maxBandwidth,
128  T& frac_actual);
129 
130  void assembleOffDiagMatrices(int bandwidth,
131  int numPartitions,
132  Vector& WV_host,
133  Vector& offDiags_host,
134  IntVector& offDiagWidths_left,
135  IntVector& offDiagWidths_right,
136  IntVector& offDiagPerms_left,
137  IntVector& offDiagPerms_right,
138  MatrixMap& typeMap,
139  MatrixMap& offDiagMap,
140  MatrixMap& WVMap);
141 
142  void secondLevelReordering(int bandwidth,
143  int numPartitions,
144  IntVector& secondReorder,
145  IntVector& secondPerm,
146  IntVector& first_rows);
147 
148  void assembleBandedMatrix(int bandwidth,
149  IntVector& ks_col,
150  IntVector& ks_row,
151  Vector& B,
152  MatrixMap& typeMap,
153  MatrixMap& bandedMatMap);
154 
155  void assembleBandedMatrix(int bandwidth,
156  int numPartitions,
157  IntVector& ks_col,
158  IntVector& ks_row,
159  Vector& B,
160  IntVector& ks,
161  IntVector& BOffsets,
162  MatrixMap& typeMap,
163  MatrixMap& bandedMatMap);
164 
165 private:
166  int m_n;
167  int m_nnz;
168  EdgeVector m_edges;
169  EdgeVector m_major_edges;
170 
171  EdgeIterator m_first;
172 
173  bool m_trackReordering;
174 
175  double m_timeMC64;
176  double m_timeRCM;
177  double m_timeDropoff;
178 
179  BoolVector m_exists;
180 
181  bool MC64(bool scale,
182  IntVector& mc64RowPerm,
183  DoubleVector& mc64RowScale,
184  DoubleVector& mc64ColScale,
185  MatrixMapF& scaleMap);
186 
187  int RCM(EdgeVector& edges,
188  IntVector& optReordering,
189  IntVector& optPerm);
190 
191  bool partitionedRCM(EdgeIterator& begin,
192  EdgeIterator& end,
193  int node_begin,
194  int node_end,
195  IntVector& optReordering,
196  IntVector& optPerm);
197 
198  void buildTopology(EdgeIterator& begin,
199  EdgeIterator& end,
200  IntVector& degrees,
201  std::vector<int>* in_out_graph);
202 
203  static const double LOC_INFINITY;
204 
205  // Functions used in MC64
206  void find_minimum_match(IntVector& mc64RowPerm,
207  DoubleVector& mc64RowScale,
208  DoubleVector& mc64ColScale);
209  void init_reduced_cval(IntVector& row_ptr,
210  IntVector& rows,
211  DoubleVector& c_val,
212  DoubleVector& u_val,
213  DoubleVector& v_val,
214  IntVector& match_nodes,
215  IntVector& rev_match_nodes,
216  IntVector& matched,
217  IntVector& rev_matched);
218  bool find_shortest_aug_path(int init_node,
219  IntVector& matched, IntVector& rev_matched,
220  IntVector& match_nodes, IntVector& rev_match_nodes,
221  IntVector& row_ptr, IntVector& rows, IntVector& prev,
222  DoubleVector& u_val,
223  DoubleVector& v_val,
224  DoubleVector& c_val,
225  IntVector& irn);
226  void get_csc_matrix(IntVector& row_ptr,
227  IntVector& rows,
228  DoubleVector& c_val,
229  DoubleVector& max_val_in_col);
230 
231  // Functor objects.
232  struct CompareEdgeLength
233  {
234  bool operator()(const EdgeT<T>& a, const EdgeT<T>& b)
235  {
236  return abs(a.m_from - a.m_to) > abs(b.m_from - b.m_to);
237  }
238  };
239 
240  struct AccumulateEdgeValue
241  {
242  T operator()(T res, const EdgeT<T>& edge)
243  {
244  return res + std::abs(edge.m_val);
245  }
246  };
247 
248  struct EdgeLength : public thrust::unary_function<EdgeT<T>, int>
249  {
250  __host__ __device__
251  int operator() (const EdgeT<T>& a)
252  {
253  return std::abs(a.m_from - a.m_to);
254  }
255  };
256 
257  struct PermutedEdgeLength : public thrust::unary_function<EdgeT<T>, int>
258  {
259  int* m_perm;
260 
261  PermutedEdgeLength(int* perm): m_perm(perm) {}
262 
263  __host__ __device__
264  int operator() (const EdgeT<T>& a)
265  {
266  return std::abs(m_perm[a.m_from] - m_perm[a.m_to]);
267  }
268  };
269 
270  struct PermuteEdge
271  {
272  int* m_perm;
273 
274  PermuteEdge(int* perm): m_perm(perm) {}
275 
276  __host__ __device__
277  void operator() (EdgeT<T>& a)
278  {
279  a.m_from = m_perm[a.m_from];
280  a.m_to = m_perm[a.m_to];
281  }
282  };
283 
284  struct Exponential: public thrust::unary_function<double, double>
285  {
286  __host__
287  double operator() (double a)
288  {
289  return exp(a);
290  }
291  };
292 
293 };
294 
295 
296 template <typename T>
297 const double Graph<T>::LOC_INFINITY = 1e37;
298 
299 
300 // ----------------------------------------------------------------------------
301 // Graph::Graph()
302 //
303 // This is the constructor for the Graph class.
304 // ----------------------------------------------------------------------------
305 template <typename T>
306 Graph<T>::Graph(bool trackReordering)
307 : m_timeMC64(0),
308  m_timeRCM(0),
309  m_timeDropoff(0),
310  m_trackReordering(trackReordering)
311 {
312 }
313 
314 
315 // ----------------------------------------------------------------------------
316 // Graph::reorder()
317 //
318 // This function applies various reordering algorithms to the specified matrix
319 // (assumed to be in COO format and on the host) for bandwidth reduction and
320 // diagonal boosting. It returns the half-bandwidth after reordering.
321 // ----------------------------------------------------------------------------
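// Illustrative usage sketch (hypothetical variable names; 'A' is assumed to
// be a populated COO host matrix, i.e. a Graph<double>::MatrixCoo):
//
//   spike::Graph<double>               graph(true);
//   spike::Graph<double>::IntVector    reordering, perm, mc64RowPerm;
//   spike::Graph<double>::Vector       mc64RowScale, mc64ColScale;
//   spike::Graph<double>::MatrixMapF   scaleMap;
//   int k_mc64;
//   int halfBandwidth = graph.reorder(A, true, true,
//                                     reordering, perm, mc64RowPerm,
//                                     mc64RowScale, mc64ColScale,
//                                     scaleMap, k_mc64);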
322 template <typename T>
323 int
324 Graph<T>::reorder(const MatrixCoo& Acoo,
325  bool doMC64,
326  bool scale,
327  IntVector& optReordering,
328  IntVector& optPerm,
329  IntVector& mc64RowPerm,
330  Vector& mc64RowScale,
331  Vector& mc64ColScale,
332  MatrixMapF& scaleMap,
333  int& k_mc64)
334 {
335  m_n = Acoo.num_rows;
336  m_nnz = Acoo.num_entries;
337 
338  // Create the edges in the graph.
339  if (m_trackReordering) {
340  for (int i = 0; i < m_nnz; i++) {
341  m_edges.push_back(EdgeType(i, Acoo.row_indices[i], Acoo.column_indices[i], (T)Acoo.values[i]));
342  }
343  } else {
344  for (int i = 0; i < m_nnz; i++)
345  m_edges.push_back(EdgeType(Acoo.row_indices[i], Acoo.column_indices[i], (T)Acoo.values[i]));
346  }
347 
348  // Apply the MC64 algorithm. Note that we must always work with
349  // double-precision scale vectors.
350  //
351  // TODO: how can we check if the precision of Vector is already
352  // double, so that we can save extra copies.
353  if (doMC64) {
354  DoubleVector mc64RowScaleD;
355  DoubleVector mc64ColScaleD;
356  MC64(scale, mc64RowPerm, mc64RowScaleD, mc64ColScaleD, scaleMap);
357  mc64RowScale = mc64RowScaleD;
358  mc64ColScale = mc64ColScaleD;
359  } else {
360  mc64RowScale.resize(m_n);
361  mc64ColScale.resize(m_n);
362  mc64RowPerm.resize(m_n);
363  scaleMap.resize(m_nnz);
364 
365  thrust::sequence(mc64RowPerm.begin(), mc64RowPerm.end());
366  cusp::blas::fill(mc64RowScale, (T) 1.0);
367  cusp::blas::fill(mc64ColScale, (T) 1.0);
368  cusp::blas::fill(scaleMap, (T) 1.0);
369  }
370 
371  k_mc64 = 0;
372  for (EdgeIterator edgeIt = m_edges.begin(); edgeIt != m_edges.end(); edgeIt++)
373  if (k_mc64 < abs(edgeIt->m_from - edgeIt->m_to))
374  k_mc64 = abs(edgeIt->m_from - edgeIt->m_to);
375 
376 
377 
378  // Apply reverse Cuthill-McKee algorithm.
379  int bandwidth = RCM(m_edges, optReordering, optPerm);
380 
381  // Initialize the iterator m_first (in case dropOff() is not called).
382  m_first = m_edges.begin();
383 
384  // Return the bandwidth obtained after reordering.
385  return bandwidth;
386 }
387 
388 
389 // ----------------------------------------------------------------------------
390 // Graph::dropOff()
391 //
392 // This function identifies the elements that can be removed in the reordered
393 // matrix while reducing the element-wise 1-norm by no more than the specified
394 // fraction.
395 //
396 // We cache an iterator in the (now ordered) vector of edges, such that the
397 // edges from that point to the end encode a banded matrix whose element-wise
398 // 1-norm is at least (1-frac) ||A||_1.
399 //
400 // Note that the final bandwidth is guaranteed to be no more than the specified
401 // maxBandwidth value.
402 //
403 // The return value is the number of dropped diagonals. We also return the
404 // actual norm reduction fraction achieved.
405 // ----------------------------------------------------------------------------
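// Worked example (hypothetical numbers): if ||A||_1 = 100 and frac = 0.05,
// then min_norm_out = 95. Bands are visited from the outermost (longest
// edges) inward; a band whose bandwidth exceeds maxBandwidth is always
// dropped, while a band within maxBandwidth is dropped only if the norm
// retained after removing it is still at least 95.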
406 template <typename T>
407 int
408 Graph<T>::dropOff(T frac,
409  int maxBandwidth,
410  T& frac_actual)
411 {
412  CPUTimer timer;
413  timer.Start();
414 
415  // Sort the edges in *decreasing* order of their length (the difference
416  // between the indices of their adjacent nodes).
417  std::sort(m_edges.begin(), m_edges.end(), CompareEdgeLength());
418 
419  // Calculate the 1-norm of the current matrix and the minimum norm that
420  // must be retained after drop-off. Initialize the 1-norm of the resulting
421  // truncated matrix.
422  T norm_in = std::accumulate(m_edges.begin(), m_edges.end(), (T) 0, AccumulateEdgeValue());
423  T min_norm_out = (1 - frac) * norm_in;
424  T norm_out = norm_in;
425 
426  // Walk all edges and accumulate the weight (1-norm) of one band at a time.
427  // Continue until we are left with the main diagonal only or until the weight
428  // of all processed bands exceeds the allowable drop-off (provided we do not
429  // exceed the specified maximum bandwidth).
430  m_first = m_edges.begin();
431  EdgeIterator last = m_first;
432  int dropped = 0;
433 
434  while (true) {
435  // Current band
436  int bandwidth = abs(m_first->m_from - m_first->m_to);
437 
438  // Stop now if we reached the main diagonal.
439  if (bandwidth == 0)
440  break;
441 
442  // Find all edges in the current band and calculate the norm of the band.
443  do {last++;} while(abs(last->m_from - last->m_to) == bandwidth);
444 
445  T band_norm = std::accumulate(m_first, last, (T) 0, AccumulateEdgeValue());
446 
447  // Stop now if we are below the specified maximum and removing this band
448  // would reduce the norm by more than allowed.
449  if (bandwidth <= maxBandwidth && norm_out - band_norm < min_norm_out)
450  break;
451 
452  // Remove the norm of this band and move to the next one.
453  norm_out -= band_norm;
454  m_first = last;
455  dropped++;
456  }
457 
458  timer.Stop();
459  m_timeDropoff = timer.getElapsed();
460 
461  // Calculate the actual norm reduction fraction.
462  frac_actual = 1 - norm_out/norm_in;
463 
464  return dropped;
465 }
466 
467 
468 // ----------------------------------------------------------------------------
469 // Graph::assembleOffDiagMatrices()
470 //
471 // This function finds all non-zeros that belong to the off-diagonal coupling
472 // blocks and assembles the corresponding off-diagonal matrices.
473 // ----------------------------------------------------------------------------
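// Partition layout reminder (hypothetical sizes): with m_n = 10 and
// numPartitions = 3, partSize = 3 and remainder = 1, so partition 0 owns
// rows 0-3 (partSize + 1 rows) and partitions 1 and 2 own 3 rows each.
// The curPartNum computation below first guesses l / (partSize + 1) and then
// corrects the guess once it passes the 'remainder' larger partitions.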
474 template <typename T>
475 void
476 Graph<T>::assembleOffDiagMatrices(int bandwidth,
477  int numPartitions,
478  Vector& WV_host,
479  Vector& offDiags_host,
480  IntVector& offDiagWidths_left,
481  IntVector& offDiagWidths_right,
482  IntVector& offDiagPerms_left,
483  IntVector& offDiagPerms_right,
484  MatrixMap& typeMap,
485  MatrixMap& offDiagMap,
486  MatrixMap& WVMap)
487 {
488  if (WV_host.size() != 2*bandwidth*bandwidth*(numPartitions-1)) {
489  WV_host.resize(2*bandwidth*bandwidth*(numPartitions-1), 0);
490  offDiags_host.resize(2*bandwidth*bandwidth*(numPartitions-1), 0);
491  offDiagWidths_left.resize(numPartitions-1, 0);
492  offDiagWidths_right.resize(numPartitions-1, 0);
493 
494  } else {
495  cusp::blas::fill(WV_host, (T) 0);
496  cusp::blas::fill(offDiags_host, (T) 0);
497  cusp::blas::fill(offDiagWidths_left, 0);
498  cusp::blas::fill(offDiagWidths_right, 0);
499  }
500 
501  offDiagPerms_left.resize((numPartitions-1) * bandwidth, -1);
502  offDiagPerms_right.resize((numPartitions-1) * bandwidth, -1);
503 
504  IntVector offDiagReorderings_left((numPartitions-1) * bandwidth, -1);
505  IntVector offDiagReorderings_right((numPartitions-1) * bandwidth, -1);
506 
507  EdgeIterator first = m_first;
508 
509  int partSize = m_n / numPartitions;
510  int remainder = m_n % numPartitions;
511 
512  m_major_edges.clear();
513 
514  if (m_trackReordering) {
515  typeMap.resize(m_nnz);
516  offDiagMap.resize(m_nnz);
517  WVMap.resize(m_nnz);
518  }
519 
520  m_exists.resize(m_n);
521  cusp::blas::fill(m_exists, false);
522 
523  for (EdgeIterator it = first; it != m_edges.end(); ++it) {
524  int j = it->m_from;
525  int l = it->m_to;
526  int curPartNum = l / (partSize + 1);
527  if (curPartNum >= remainder)
528  curPartNum = remainder + (l-remainder * (partSize + 1)) / partSize;
529 
530  int curPartNum2 = j / (partSize + 1);
531  if (curPartNum2 >= remainder)
532  curPartNum2 = remainder + (j-remainder * (partSize + 1)) / partSize;
533 
534  if (curPartNum == curPartNum2)
535  m_major_edges.push_back(*it);
536  else {
537  if (curPartNum > curPartNum2) { // V/B Matrix
538  m_exists[j] = true;
539  int partEndRow = partSize * curPartNum2;
540  if (curPartNum2 < remainder)
541  partEndRow += curPartNum2 + partSize + 1;
542  else
543  partEndRow += remainder + partSize;
544 
545  int partStartCol = partSize * curPartNum;
546  if (curPartNum < remainder)
547  partStartCol += curPartNum;
548  else
549  partStartCol += remainder;
550 
551  if (offDiagReorderings_right[curPartNum2*bandwidth+l-partStartCol] < 0) {
552  offDiagReorderings_right[curPartNum2*bandwidth+l-partStartCol] = offDiagWidths_right[curPartNum2];
553  offDiagPerms_right[curPartNum2*bandwidth+offDiagWidths_right[curPartNum2]] = l-partStartCol;
554  offDiagWidths_right[curPartNum2]++;
555  }
556 
557  if (m_trackReordering) {
558  typeMap[it->m_ori_idx] = 0;
559  offDiagMap[it->m_ori_idx] = curPartNum2*2*bandwidth*bandwidth + (j+bandwidth-partEndRow) * bandwidth + (l-partStartCol);
560  WVMap[it->m_ori_idx] = curPartNum2*2*bandwidth*bandwidth + (j+bandwidth-partEndRow) + offDiagReorderings_right[curPartNum2*bandwidth+l-partStartCol] * bandwidth;
561  }
562 
563  offDiags_host[curPartNum2*2*bandwidth*bandwidth + (j+bandwidth-partEndRow) * bandwidth + (l-partStartCol)] = WV_host[curPartNum2*2*bandwidth*bandwidth + (j+bandwidth-partEndRow) + offDiagReorderings_right[curPartNum2*bandwidth+l-partStartCol] * bandwidth] = it->m_val;
564 
565  } else { // W/C Matrix
566  int partStartRow = partSize * curPartNum2;
567  if (curPartNum2 < remainder)
568  partStartRow += curPartNum2;
569  else
570  partStartRow += remainder;
571 
572  int partEndCol = partSize * curPartNum;
573  if (curPartNum < remainder)
574  partEndCol += curPartNum + partSize + 1;
575  else
576  partEndCol += remainder + partSize;
577 
578  if (offDiagReorderings_left[(curPartNum2-1)*bandwidth+l-partEndCol+bandwidth] < 0) {
579  offDiagReorderings_left[(curPartNum2-1)*bandwidth+l-partEndCol+bandwidth] = bandwidth - 1 - offDiagWidths_left[curPartNum2-1];
580  offDiagPerms_left[(curPartNum2-1)*bandwidth+bandwidth-1-offDiagWidths_left[curPartNum2-1]] = l-partEndCol+bandwidth;
581  offDiagWidths_left[curPartNum2-1]++;
582  }
583 
584  if (m_trackReordering) {
585  typeMap[it->m_ori_idx] = 0;
586  offDiagMap[it->m_ori_idx] = (curPartNum*2+1)*bandwidth*bandwidth + (j-partStartRow) * bandwidth + (l-partEndCol+bandwidth);
587  WVMap[it->m_ori_idx] = (curPartNum*2+1)*bandwidth*bandwidth + (j-partStartRow) + (offDiagReorderings_left[(curPartNum2-1)*bandwidth+l-partEndCol+bandwidth]) * bandwidth;
588  }
589 
590  offDiags_host[(curPartNum*2+1)*bandwidth*bandwidth + (j-partStartRow) * bandwidth + (l-partEndCol+bandwidth)] = WV_host[(curPartNum*2+1)*bandwidth*bandwidth + (j-partStartRow) + (offDiagReorderings_left[(curPartNum2-1)*bandwidth+l-partEndCol+bandwidth]) * bandwidth] = it->m_val;
591  }
592  }
593  }
594 }
595 
596 
597 // ----------------------------------------------------------------------------
598 // Graph::secondLevelReordering()
599 //
600 // This function applies the second-stage reordering if the flags ``m_reorder''
601 // and ``m_secondLevelReordering'' are both true. Second-level reordering is
602 // essentially RCM applied independently within each partition.
603 // ----------------------------------------------------------------------------
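// The work below proceeds in three steps: (1) sort m_major_edges by column
// with a counting sort, (2) run partitionedRCM() on each partition's edge
// range, and (3) fill first_rows: for each internal partition boundary, the
// smallest permuted index among the last 'bandwidth' rows of the partition
// that carry an off-diagonal coupling (m_exists), defaulting to the boundary
// itself.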
604 template <typename T>
605 void
606 Graph<T>::secondLevelReordering(int bandwidth,
607  int numPartitions,
608  IntVector& secondReorder,
609  IntVector& secondPerm,
610  IntVector& first_rows)
611 {
612  {
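// Counting sort of m_major_edges by destination column (m_to): secondPerm
// is first used as a per-column counter, then (after the inclusive scan)
// as end offsets into the sorted edge array.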
613  secondPerm.resize(m_n);
614  cusp::blas::fill(secondPerm, 0);
615  for (EdgeIterator edgeIt = m_major_edges.begin(); edgeIt != m_major_edges.end(); edgeIt++)
616  secondPerm[edgeIt -> m_to]++;
617  thrust::inclusive_scan(secondPerm.begin(), secondPerm.end(), secondPerm.begin());
618  EdgeVector tmp_edges(m_major_edges.size());
619  for (EdgeRevIterator edgeIt = m_major_edges.rbegin(); edgeIt != m_major_edges.rend(); edgeIt++)
620  tmp_edges[--secondPerm[edgeIt->m_to]] = *edgeIt;
621  m_major_edges = tmp_edges;
622  }
623 
624  int node_begin = 0, node_end;
625  int partSize = m_n / numPartitions;
626  int remainder = m_n % numPartitions;
627  EdgeIterator edgeBegin = m_major_edges.begin(), edgeEnd;
628  secondReorder.resize(m_n);
629 
630  for (int i = 0; i < numPartitions; i++) {
631  if (i < remainder)
632  node_end = node_begin + partSize + 1;
633  else
634  node_end = node_begin + partSize;
635 
636  for (edgeEnd = edgeBegin; edgeEnd != m_major_edges.end(); edgeEnd++) {
637  if (edgeEnd->m_from >= node_end)
638  break;
639  }
640 
641  partitionedRCM(edgeBegin,
642  edgeEnd,
643  node_begin,
644  node_end,
645  secondReorder,
646  secondPerm);
647 
648  node_begin = node_end;
649  edgeBegin = edgeEnd;
650  }
651 
652  first_rows.resize(numPartitions - 1);
653 
654  int last_row = 0;
655  for (int i=0; i<numPartitions - 1; i++) {
656  if (i < remainder)
657  last_row += (partSize + 1);
658  else
659  last_row += partSize;
660 
661  first_rows[i] = last_row;
662  for (int j = last_row-bandwidth; j < last_row; j++)
663  if (m_exists[j] && first_rows[i] > secondPerm[j])
664  first_rows[i] = secondPerm[j];
665  }
666 }
667 
668 
669 // ----------------------------------------------------------------------------
670 // Graph::assembleBandedMatrix()
671 //
672 // These functions assemble the Spike banded matrix using current information
673 // in the graph. The first version creates a banded matrix of constant
674 // bandwidth across all partitions. The second version assembles a matrix
675 // that has banded diagonal blocks of different bandwidths for each partition.
676 // ----------------------------------------------------------------------------
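// Storage convention for the constant-bandwidth version: column l occupies
// the slice [l*(2*bandwidth+1), (l+1)*(2*bandwidth+1)) of B, and entry (j, l)
// lands at offset bandwidth + j - l inside that slice. For example
// (hypothetical numbers), with bandwidth = 2 the entry (j, l) = (3, 4) is
// stored at index 4*5 + 2 + 3 - 4 = 21.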
677 template <typename T>
678 void
679 Graph<T>::assembleBandedMatrix(int bandwidth,
680  IntVector& ks_col,
681  IntVector& ks_row,
682  Vector& B,
683  MatrixMap& typeMap,
684  MatrixMap& bandedMatMap)
685 {
686  // Drop all edges from begin() to 'first'; i.e., keep all edges from
687  // 'first' to end().
688  size_t Bsize = (size_t) (2 * bandwidth + 1) * m_n;
689  B.resize(Bsize);
690 
691  ks_col.resize(m_n, 0);
692  ks_row.resize(m_n, 0);
693 
694  if (m_trackReordering) {
695  if (typeMap.size() <= 0)
696  typeMap.resize(m_nnz);
697  bandedMatMap.resize(m_nnz);
698  }
699 
700  if (m_trackReordering) {
701  for (EdgeIterator it = m_first; it != m_edges.end(); ++it) {
702  int j = it->m_from;
703  int l = it->m_to;
704 
705  size_t i = (size_t) l * (2 * bandwidth + 1) + bandwidth + j - l;
706  B[i] = it->m_val;
707  typeMap[it->m_ori_idx] = 1;
708  bandedMatMap[it->m_ori_idx] = i;
709 
710  if (ks_col[l] < j - l)
711  ks_col[l] = j - l;
712  if (ks_row[j] < l-j)
713  ks_row[j] = l-j;
714  }
715  } else {
716  for (EdgeIterator it = m_first; it != m_edges.end(); ++it) {
717  int j = it->m_from;
718  int l = it->m_to;
719 
720  size_t i = (size_t) l * (2 * bandwidth + 1) + bandwidth + j - l;
721  B[i] = it->m_val;
722 
723  if (ks_col[l] < j - l)
724  ks_col[l] = j - l;
725  if (ks_row[j] < l-j)
726  ks_row[j] = l-j;
727  }
728  }
729 
730  for (int i=1; i<m_n; i++) {
731  if (ks_col[i] < ks_col[i-1] - 1)
732  ks_col[i] = ks_col[i-1] - 1;
733  if (ks_row[i] < ks_row[i-1] - 1)
734  ks_row[i] = ks_row[i-1] - 1;
735  }
736 }
737 
738 template <typename T>
739 void
740 Graph<T>::assembleBandedMatrix(int bandwidth,
741  int numPartitions,
742  IntVector& ks_col,
743  IntVector& ks_row,
744  Vector& B,
745  IntVector& ks,
746  IntVector& BOffsets,
747  MatrixMap& typeMap,
748  MatrixMap& bandedMatMap)
749 {
750  ks.resize(numPartitions, 0);
751  BOffsets.resize(numPartitions + 1);
752 
753  BOffsets[0] = 0;
754 
755  int partSize = m_n / numPartitions;
756  int remainder = m_n % numPartitions;
757 
758  EdgeIterator toStart = m_major_edges.begin(), toEnd = m_major_edges.end();
759 
760  for (EdgeIterator it = toStart; it != toEnd; ++it) {
761  int j = it->m_from;
762  int l = it->m_to;
763  int curPartNum = l / (partSize + 1);
764  if (curPartNum >= remainder)
765  curPartNum = remainder + (l-remainder * (partSize + 1)) / partSize;
766  if (ks[curPartNum] < abs(l-j))
767  ks[curPartNum] = abs(l-j);
768  }
769 
770  for (int i=0; i < numPartitions; i++) {
771  if (i < remainder)
772  BOffsets[i+1] = BOffsets[i] + (partSize + 1) * (2 * ks[i] + 1);
773  else
774  BOffsets[i+1] = BOffsets[i] + (partSize) * (2 * ks[i] + 1);
775  }
776 
777  B.resize(BOffsets[numPartitions]);
778 
779  if (m_trackReordering) {
780  if (typeMap.size() <= 0)
781  typeMap.resize(m_nnz);
782  bandedMatMap.resize(m_nnz);
783  }
784 
785  ks_col.resize(m_n, 0);
786  ks_row.resize(m_n, 0);
787 
788  for (EdgeIterator it = toStart; it != toEnd; ++it) {
789  int j = it->m_from;
790  int l = it->m_to;
791 
792  int curPartNum = l / (partSize + 1);
793  int l_in_part;
794  if (curPartNum >= remainder) {
795  l_in_part = l - remainder * (partSize + 1);
796  curPartNum = remainder + l_in_part / partSize;
797  l_in_part %= partSize;
798  } else {
799  l_in_part = l % (partSize + 1);
800  }
801 
802  int K = ks[curPartNum];
803  int i = BOffsets[curPartNum] + l_in_part * (2 * K + 1) + K + j - l;
804  B[i] = it->m_val;
805 
806  if (ks_col[l] < j - l)
807  ks_col[l] = j - l;
808  if (ks_row[j] < l-j)
809  ks_row[j] = l-j;
810 
811  if (m_trackReordering) {
812  typeMap[it->m_ori_idx] = 1;
813  bandedMatMap[it->m_ori_idx] = i;
814  }
815  }
816 
817  int partBegin = 0, partEnd = partSize;
818  for (int i=0; i<numPartitions; i++) {
819  if (i < remainder)
820  partEnd++;
821  for (int j = partBegin+1; j < partEnd; j++) {
822  if (ks_col[j] < ks_col[j-1] - 1)
823  ks_col[j] = ks_col[j-1] - 1;
824  if (ks_col[j] > partEnd - j - 1)
825  ks_col[j] = partEnd - j - 1;
826 
827  if (ks_row[j] < ks_row[j-1] - 1)
828  ks_row[j] = ks_row[j-1] - 1;
829  if (ks_row[j] > partEnd - j - 1)
830  ks_row[j] = partEnd - j - 1;
831  }
832  partBegin = partEnd;
833  partEnd = partBegin + partSize;
834  }
835 }
836 
837 
838 // ----------------------------------------------------------------------------
839 // Graph::MC64()
840 //
841 // This function performs the MC64 reordering (and optional scaling) algorithm.
842 // ----------------------------------------------------------------------------
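// Background note: MC64 chooses a row permutation that maximizes the product
// of the magnitudes of the (permuted) diagonal entries. When 'scale' is true,
// the dual values computed by find_minimum_match() give row scales exp(u_i)
// and column scales exp(v_j)/max_col_j; on matched entries u_i + v_j equals
// log(max_col_j / |a_ij|), so each scaled diagonal entry has magnitude 1 and
// every other scaled entry has magnitude at most 1.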
843 template <typename T>
844 bool
845 Graph<T>::MC64(bool scale,
846  IntVector& mc64RowPerm,
847  DoubleVector& mc64RowScale,
848  DoubleVector& mc64ColScale,
849  MatrixMapF& scaleMap)
850 {
851  find_minimum_match(mc64RowPerm, mc64RowScale, mc64ColScale);
852 
853  if (m_trackReordering)
854  scaleMap.resize(m_nnz);
855 
856  if (scale) {
857  for (EdgeIterator iter = m_edges.begin(); iter != m_edges.end(); iter++) {
858  int from = iter->m_from;
859  int to = iter->m_to;
860  T scaleFact = (T)(mc64RowScale[from] * mc64ColScale[to]);
861  if (m_trackReordering)
862  scaleMap[iter->m_ori_idx] = scaleFact;
863  iter->m_val *= scaleFact;
864  iter->m_from = mc64RowPerm[from];
865  }
866  } else {
867  for(EdgeIterator iter = m_edges.begin(); iter != m_edges.end(); iter++)
868  iter->m_from = mc64RowPerm[iter->m_from];
869 
870  if (m_trackReordering)
871  cusp::blas::fill(scaleMap, (T) 1.0);
872  }
873 
874  return true;
875 }
876 
877 
878 // ----------------------------------------------------------------------------
879 // Graph::RCM()
880 //
881 // This function implements the Reverse Cuthill-McKee algorithm...
882 // The return value is the obtained bandwidth. A value of -1 is returned if
883 // the algorithm fails.
884 // ----------------------------------------------------------------------------
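// Outline of the implementation below: starting from a seed node, nodes are
// visited in BFS order, but the unvisited neighbours of each node are first
// pushed into a priority queue so that they enter the BFS queue by increasing
// degree. The resulting ordering is kept whenever it improves the bandwidth;
// up to MAX_NUM_TRIAL random seeds are tried (more if the bandwidth is still
// at least BANDWIDTH_MIN_REQUIRED).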
885 template <typename T>
886 int
887 Graph<T>::RCM(EdgeVector& edges,
888  IntVector& optReordering,
889  IntVector& optPerm)
890 {
891  optReordering.resize(m_n);
892  optPerm.resize(m_n);
893 
894  int nnz = edges.size();
895 
896  IntVector tmp_reordering(m_n);
897  IntVector degrees(m_n, 0);
898 
899  thrust::sequence(optReordering.begin(), optReordering.end());
900 
901  std::vector<int> *in_out_graph;
902 
903  in_out_graph = new std::vector<int> [m_n];
904 
905  EdgeIterator begin = edges.begin();
906  EdgeIterator end = edges.end();
907  int tmp_bdwidth = thrust::transform_reduce(begin, end, EdgeLength(), 0, thrust::maximum<int>());
908  int bandwidth = tmp_bdwidth;
909  buildTopology(begin, end, degrees, in_out_graph);
910 
911  const int MAX_NUM_TRIAL = 10;
912  const int BANDWIDTH_THRESHOLD = 256;
913  const int BANDWIDTH_MIN_REQUIRED = 10000;
914 
915  CPUTimer timer;
916  timer.Start();
917 
918  BoolVector tried(m_n, false);
919  tried[0] = true;
920 
921  int last_tried = 0;
922 
923  for (int trial_num = 0; trial_num < MAX_NUM_TRIAL || (bandwidth >= BANDWIDTH_MIN_REQUIRED && trial_num < 10*MAX_NUM_TRIAL); trial_num++)
924  {
925  std::queue<int> q;
926  std::priority_queue<NodeType> pq;
927 
928  int tmp_node;
929  BoolVector pushed(m_n, false);
930 
931  int left_cnt = m_n;
932  int j = 0, last = 0;
933 
934  if (trial_num > 0) {
935 
936  if (trial_num < MAX_NUM_TRIAL) {
937  tmp_node = rand() % m_n;
938 
939  while(tried[tmp_node])
940  tmp_node = rand() % m_n;
941  } else {
942  if (last_tried >= m_n - 1) {
943  fprintf(stderr, "All possible starting points have been tried in RCM\n");
944  break;
945  }
946  for (tmp_node = last_tried+1; tmp_node < m_n; tmp_node++)
947  if (!tried[tmp_node]) {
948  last_tried = tmp_node;
949  break;
950  }
951  }
952 
953  pushed[tmp_node] = true;
954  tried[tmp_node] = true;
955  q.push(tmp_node);
956  }
957 
958  while(left_cnt--) {
959  if(q.empty()) {
960  left_cnt++;
961  int i;
962 
963  for(i = last; i < m_n; i++) {
964  if(!pushed[i]) {
965  q.push(i);
966  pushed[i] = true;
967  last = i;
968  break;
969  }
970  }
971  if(i < m_n) continue;
972  fprintf(stderr, "Can never get here!\n");
973  return -1;
974  }
975 
976  tmp_node = q.front();
977  tmp_reordering[j] = tmp_node;
978  j++;
979 
980  q.pop();
981 
982  std::vector<int> &tmp_vec = in_out_graph[tmp_node];
983  int out_size = tmp_vec.size();
984  if(out_size != 0) {
985  for (int i = 0; i < out_size; i++) {
986  int target_node = tmp_vec[i];
987  if(!pushed[target_node]) {
988  pushed[target_node] = true;
989  pq.push(NodeType(target_node, degrees[target_node]));
990  }
991  }
992  }
993 
994  while(!pq.empty()) {
995  q.push(pq.top().m_idx);
996  pq.pop();
997  }
998  }
999 
1000  thrust::scatter(thrust::make_counting_iterator(0),
1001  thrust::make_counting_iterator(m_n),
1002  tmp_reordering.begin(),
1003  optPerm.begin());
1004 
1005  {
1006  int *perm_array = thrust::raw_pointer_cast(&optPerm[0]);
1007  tmp_bdwidth = thrust::transform_reduce(edges.begin(), edges.end(), PermutedEdgeLength(perm_array), 0, thrust::maximum<int>());
1008  }
1009 
1010  if(bandwidth > tmp_bdwidth) {
1011  bandwidth = tmp_bdwidth;
1012  optReordering = tmp_reordering;
1013  }
1014 
1015  if(bandwidth <= BANDWIDTH_THRESHOLD)
1016  break;
1017  }
1018 
1019 
1020  timer.Stop();
1021  m_timeRCM = timer.getElapsed();
1022 
1023  thrust::scatter(thrust::make_counting_iterator(0),
1024  thrust::make_counting_iterator(m_n),
1025  optReordering.begin(),
1026  optPerm.begin());
1027 
1028  {
1029  int* perm_array = thrust::raw_pointer_cast(&optPerm[0]);
1030  thrust::for_each(edges.begin(), edges.end(), PermuteEdge(perm_array));
1031  }
1032 
1033  delete [] in_out_graph;
1034 
1035  return bandwidth;
1036 }
1037 
1038 
1039 // ----------------------------------------------------------------------------
1040 // Graph::partitionedRCM()
1041 //
1042 // This function implements the second-level Reverse Cuthill-McKee algorithm,
1043 // restricted to a specified partition.
1044 // ----------------------------------------------------------------------------
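// This is the same BFS/min-degree procedure as RCM() above, restricted to the
// node range [node_begin, node_end); the scratch arrays are static so that
// they are allocated once and reused across the per-partition calls.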
1045 template <typename T>
1046 bool
1047 Graph<T>::partitionedRCM(EdgeIterator& begin,
1048  EdgeIterator& end,
1049  int node_begin,
1050  int node_end,
1051  IntVector& optReordering,
1052  IntVector& optPerm)
1053 {
1054  static std::vector<int> tmp_reordering;
1055  tmp_reordering.resize(m_n);
1056 
1057  static IntVector degrees;
1058  if (degrees.size() != m_n)
1059  degrees.resize(m_n, 0);
1060  else
1061  cusp::blas::fill(degrees, 0);
1062 
1063  // for(int i = node_begin; i < node_end; i++)
1064  // optReordering[i] = i;
1065  thrust::sequence(optReordering.begin()+node_begin, optReordering.begin()+node_end, node_begin);
1066 
1067  std::vector<int>* in_out_graph;
1068 
1069  in_out_graph = new std::vector<int> [node_end];
1070 
1071  int tmp_bdwidth = thrust::transform_reduce(begin, end, EdgeLength(), 0, thrust::maximum<int>());
1072  int opt_bdwidth = tmp_bdwidth;
1073  buildTopology(begin, end, degrees, in_out_graph);
1074 
1075  const int MAX_NUM_TRIAL = 10;
1076  const int BANDWIDTH_THRESHOLD = 128;
1077 
1078  static BoolVector tried(m_n, false);
1079  if (tried.size() != m_n)
1080  tried.resize(m_n, false);
1081  else
1082  cusp::blas::fill(tried, false);
1083 
1084  tried[node_begin] = true;
1085 
1086  CPUTimer timer;
1087  timer.Start();
1088 
1089  for (int num_trial = 0; num_trial < MAX_NUM_TRIAL; num_trial++) {
1090  std::queue<int> q;
1091  std::priority_queue<NodeType> pq;
1092 
1093  int tmp_node;
1094  BoolVector pushed(node_end, false);
1095 
1096  if (num_trial > 0) {
1097  tmp_node = rand() % (node_end - node_begin) + node_begin;
1098  while (tried[tmp_node])
1099  tmp_node = rand() % (node_end - node_begin) + node_begin;
1100  tried[tmp_node] = pushed[tmp_node] = true;
1101  q.push(tmp_node);
1102  }
1103 
1104  int left_cnt = node_end - node_begin;
1105  int j = node_begin, last = node_begin;
1106 
1107  while(left_cnt--) {
1108  if(q.empty()) {
1109  left_cnt++;
1110  int i;
1111  for(i = last; i < node_end; i++) {
1112  if(!pushed[i]) {
1113  q.push(i);
1114  pushed[i] = true;
1115  last = i;
1116  break;
1117  }
1118  }
1119  if(i < node_end) continue;
1120  fprintf(stderr, "Can never get here!\n");
1121  return false;
1122  }
1123 
1124  tmp_node = q.front();
1125  tmp_reordering[j] = tmp_node;
1126  j++;
1127 
1128  q.pop();
1129 
1130  std::vector<int>& tmp_vec = in_out_graph[tmp_node];
1131  int in_out_size = tmp_vec.size();
1132  if(in_out_size != 0) {
1133  for (int i = 0; i < in_out_size; i++) {
1134  int target_node = tmp_vec[i];
1135  if(!pushed[target_node]) {
1136  pushed[target_node] = true;
1137  pq.push(NodeType(target_node, degrees[target_node]));
1138  }
1139  }
1140  }
1141 
1142  while(!pq.empty()) {
1143  q.push(pq.top().m_idx);
1144  pq.pop();
1145  }
1146  }
1147 
1148  thrust::scatter(thrust::make_counting_iterator(node_begin),
1149  thrust::make_counting_iterator(node_end),
1150  tmp_reordering.begin() + node_begin,
1151  optPerm.begin());
1152 
1153  {
1154  int *perm_array = thrust::raw_pointer_cast(&optPerm[0]);
1155  tmp_bdwidth = thrust::transform_reduce(begin, end, PermutedEdgeLength(perm_array), 0, thrust::maximum<int>());
1156  }
1157 
1158  if(opt_bdwidth > tmp_bdwidth) {
1159  opt_bdwidth = tmp_bdwidth;
1160 
1161  thrust::copy(tmp_reordering.begin()+node_begin, tmp_reordering.begin()+node_end, optReordering.begin()+node_begin);
1162  }
1163 
1164  if(opt_bdwidth <= BANDWIDTH_THRESHOLD)
1165  break;
1166  }
1167 
1168  timer.Stop();
1169  m_timeRCM += timer.getElapsed();
1170 
1171  thrust::scatter(thrust::make_counting_iterator(node_begin),
1172  thrust::make_counting_iterator(node_end),
1173  optReordering.begin() + node_begin,
1174  optPerm.begin());
1175 
1176  {
1177  int* perm_array = thrust::raw_pointer_cast(&optPerm[0]);
1178  thrust::for_each(begin, end, PermuteEdge(perm_array));
1179  }
1180 
1181  delete [] in_out_graph;
1182 
1183  return true;
1184 }
1185 
1186 // --------------------------------------------------------------------------
1187 // Graph::buildTopology()
1188 //
1189 // This function builds the graph topology (adjacency lists and node degrees) for RCM processing.
1190 // --------------------------------------------------------------------------
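// For every off-diagonal edge (from, to), each endpoint is added to the
// other's adjacency list and both degrees are incremented, i.e. the matrix is
// treated as structurally symmetric for the purposes of RCM.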
1191 template <typename T>
1192 void
1193 Graph<T>::buildTopology(EdgeIterator& begin,
1194  EdgeIterator& end,
1195  IntVector& degrees,
1196  std::vector<int>* in_out_graph)
1197 {
1198  for(EdgeIterator edgeIt = begin; edgeIt != end; edgeIt++) {
1199  int from = edgeIt -> m_from, to = edgeIt -> m_to;
1200  if (from != to) {
1201  in_out_graph[from].push_back(to);
1202  in_out_graph[to].push_back(from);
1203  degrees[from]++;
1204  degrees[to]++;
1205  }
1206  }
1207 }
1208 
1209 // ----------------------------------------------------------------------------
1210 // Graph::find_minimum_match()
1211 // Graph::get_csc_matrix
1212 // Graph::init_reduced_cval
1213 // Graph::find_shortest_aug_path
1214 //
1215 // These are the worker functions for the MC64 algorithm.
1216 // ----------------------------------------------------------------------------
1217 
1218 // ----------------------------------------------------------------------------
1219 // Graph::find_minimum_match()
1220 //
1221 // This is the entry point to the core of the MC64 algorithm, which reorders
1222 // the matrix by finding a minimum-weight matching of a bipartite graph.
1223 // ----------------------------------------------------------------------------
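// Pipeline of the routine below: (1) get_csc_matrix() builds a CSC view with
// edge weights c_ij = log(max_k |a_kj| / |a_ij|); (2) init_reduced_cval()
// computes initial dual values and a partial matching; (3)
// find_shortest_aug_path() augments the matching once per unmatched column;
// (4) the dual values are exponentiated to obtain the row scales, and the
// column scales are additionally divided by the per-column maxima.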
1224 template <typename T>
1225 void
1226 Graph<T>::find_minimum_match(IntVector& mc64RowPerm,
1227  DoubleVector& mc64RowScale,
1228  DoubleVector& mc64ColScale)
1229 {
1230  // Allocate space for the output vectors.
1231  mc64RowPerm.resize(m_n, 0);
1232  mc64RowScale.resize(m_n + 1, 0);
1233  mc64ColScale.resize(m_n + 1, 0);
1234 
1235  // Allocate space for temporary vectors.
1236  IntVector row_ptr(m_n + 1, 0);
1237  IntVector rows(m_nnz);
1238  IntVector rev_match_nodes(m_nnz);
1239  DoubleVector c_val(m_nnz);
1240  DoubleVector max_val_in_col(m_n + 1, 0);
1241  IntVector prev(m_n + 1);
1242  IntVector matched(m_n + 1, 0);
1243  IntVector rev_matched(m_n + 1, 0);
1244 
1245  get_csc_matrix(row_ptr, rows, c_val, max_val_in_col);
1246  init_reduced_cval(row_ptr, rows, c_val, mc64RowScale, mc64ColScale, mc64RowPerm, rev_match_nodes, matched, rev_matched);
1247 
1248  IntVector irn(m_n);
1249  for(int i=0; i<m_n; i++) {
1250  if(rev_matched[i]) continue;
1251  bool success = find_shortest_aug_path(i, matched, rev_matched, mc64RowPerm, rev_match_nodes, row_ptr, rows, prev, mc64RowScale, mc64ColScale, c_val, irn);
1252  }
1253 
1254  {
1255  for (int i=0; i<m_n; i++)
1256  if (!matched[i])
1257  throw system_error(system_error::Matrix_singular, "Singular matrix found");
1258 #if 0
1259  IntVector unmatched_rows;
1260  IntVector unmatched_cols;
1261 
1262  for (int i=0; i<m_n; i++)
1263  if (!matched[i])
1264  unmatched_rows.push_back(i);
1265 
1266  for (int i=0; i<m_n; i++)
1267  if (!rev_matched[i])
1268  unmatched_cols.push_back(i);
1269 
1270  int unmatched_count = unmatched_rows.size();
1271 
1272  for (int i=0; i<unmatched_count; i++)
1273  mc64RowPerm[unmatched_rows[i]] = unmatched_cols[i];
1274 #endif
1275  }
1276 
1277  mc64RowScale.pop_back();
1278  mc64ColScale.pop_back();
1279  max_val_in_col.pop_back();
1280 
1281  thrust::transform(mc64RowScale.begin(), mc64RowScale.end(), mc64RowScale.begin(), Exponential());
1282  thrust::transform(thrust::make_transform_iterator(mc64ColScale.begin(), Exponential()),
1283  thrust::make_transform_iterator(mc64ColScale.end(), Exponential()),
1284  max_val_in_col.begin(),
1285  mc64ColScale.begin(),
1286  thrust::divides<double>());
1287 }
1288 
1289 // ----------------------------------------------------------------------------
1290 // Graph::get_csc_matrix()
1291 //
1292 // This function initializes the bipartite graph used in MC64. Different MC64
1293 // variants are obtained by assigning the edge weights differently (this version
1294 // corresponds to option 5 of Intel's MC64).
1295 // ----------------------------------------------------------------------------
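// Weight formula used below: for an entry a_ij of column j,
//   c_ij = log( max_k |a_kj| / |a_ij| ) >= 0,
// so the largest entry in each column receives weight 0. Minimizing the total
// weight of a perfect matching is therefore equivalent to maximizing the
// product of the matched magnitudes.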
1296 template<typename T>
1297 void
1298 Graph<T>::get_csc_matrix(IntVector& row_ptr,
1299  IntVector& rows,
1300  DoubleVector& c_val,
1301  DoubleVector& max_val_in_col)
1302 {
1303  BoolVector row_visited(m_n, false);
1304 
1305  cusp::blas::fill(c_val, LOC_INFINITY);
1306  for (EdgeIterator edgeIt = m_edges.begin(); edgeIt != m_edges.end(); edgeIt++) {
1307  row_ptr[edgeIt->m_to]++;
1308  row_visited[edgeIt->m_from] = true;
1309  }
1310 
1311  thrust::exclusive_scan(row_ptr.begin(), row_ptr.end(), row_ptr.begin());
1312 
1313  for (EdgeIterator edgeIt = m_edges.begin(); edgeIt != m_edges.end(); edgeIt++) {
1314  int from = edgeIt->m_from;
1315  int to = edgeIt->m_to;
1316  double tmp_val = fabs(edgeIt->m_val);
1317  rows[row_ptr[to]++] = from;
1318  if (max_val_in_col[to] < tmp_val)
1319  max_val_in_col[to] = tmp_val;
1320  }
1321 
1322  for (EdgeRevIterator edgeIt = m_edges.rbegin(); edgeIt != m_edges.rend(); edgeIt++) {
1323  int to = edgeIt -> m_to;
1324  c_val[--row_ptr[to]] = log(max_val_in_col[to] / fabs(edgeIt -> m_val));
1325  }
1326 }
1327 
1328 // ----------------------------------------------------------------------------
1329 // Graph::init_reduced_cval()
1330 //
1331 // This function computes an initial (partial) matching to speed up MC64. Without
1332 // this step, the algorithm would have to start from an empty matching.
1333 // ----------------------------------------------------------------------------
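// The initial duals are the standard ones: u_i = min over columns of c_ij for
// row i, and v_j = min over rows of (c_ij - u_i) for column j; whenever the
// column minimum is attained at a still-unmatched row, that (row, column)
// pair is added to the initial matching.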
1334 template <typename T>
1335 void
1336 Graph<T>::init_reduced_cval(IntVector& row_ptr,
1337  IntVector& rows,
1338  DoubleVector& c_val,
1339  DoubleVector& u_val,
1340  DoubleVector& v_val,
1341  IntVector& match_nodes,
1342  IntVector& rev_match_nodes,
1343  IntVector& matched,
1344  IntVector& rev_matched)
1345 {
1346  int i;
1347  int j;
1348  cusp::blas::fill(u_val, LOC_INFINITY);
1349  cusp::blas::fill(v_val, LOC_INFINITY);
1350 
1351  for(i = 0; i < m_n; i++) {
1352  int start_idx = row_ptr[i], end_idx = row_ptr[i+1];
1353  for(j = start_idx; j < end_idx; j++) {
1354  if (c_val[j] > LOC_INFINITY / 2.0) continue;
1355  int row = rows[j];
1356  if(u_val[row] > c_val[j]) {
1357  u_val[row] = c_val[j];
1358  }
1359  }
1360  }
1361 
1362  for(i = 0; i < m_n; i++) {
1363  int start_idx = row_ptr[i], end_idx = row_ptr[i+1];
1364  int min_idx = -1;
1365  for(j = start_idx; j < end_idx; j++) {
1366  if (c_val[j] > LOC_INFINITY / 2.0) continue;
1367  int row = rows[j];
1368  double tmp_val = c_val[j] - u_val[row];
1369  if(v_val[i] > tmp_val) {
1370  v_val[i] = tmp_val;
1371  min_idx = j;
1372  }
1373  }
1374  if(min_idx >= 0) {
1375  int tmp_row = rows[min_idx];
1376  if(!matched[tmp_row]) {
1377  rev_matched[i] = true;
1378  matched[tmp_row] = true;
1379  match_nodes[tmp_row] = i;
1380  rev_match_nodes[i] = min_idx;
1381  }
1382  }
1383  }
1384 
1385  for(i = 0; i < m_n; i++) {
1386  if (!matched[i])
1387  u_val[i] = 0.0;
1388  if (!rev_matched[i])
1389  v_val[i] = 0.0;
1390  }
1391 }
1392 
1393 // ----------------------------------------------------------------------------
1394 // Graph::find_shortest_aug_path()
1395 //
1396 // The core of the minimum-matching algorithm: find the shortest augmenting path
1397 // from a given unmatched column and apply it.
1398 // ----------------------------------------------------------------------------
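// This is one step of the shortest-augmenting-path (Hungarian-style) method:
// a Dijkstra search over the reduced costs c_ij - u_i - v_j starting from
// 'init_node' stops as soon as reaching an unmatched row is no more expensive
// than any frontier node; the matching is then flipped along that path and
// the duals of the visited rows/columns are updated so that reduced costs
// remain non-negative.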
1399 template<typename T>
1400 bool
1401 Graph<T>::find_shortest_aug_path(int init_node,
1402  IntVector& matched,
1403  IntVector& rev_matched,
1404  IntVector& match_nodes,
1405  IntVector& rev_match_nodes,
1406  IntVector& row_ptr,
1407  IntVector& rows,
1408  IntVector& prev,
1409  DoubleVector& u_val,
1410  DoubleVector& v_val,
1411  DoubleVector& c_val,
1412  IntVector& irn)
1413 {
1414  bool success = false;
1415 
1416  static IntVector B(m_n+1, 0);
1417  int b_cnt = 0;
1418  static BoolVector inB(m_n+1, false);
1419 
1420  std::priority_queue<Dijkstra> Q;
1421  double lsp = 0.0;
1422  double lsap = LOC_INFINITY;
1423  int cur_node = init_node;
1424 
1425  int i;
1426 
1427  int isap = -1;
1428  int ksap = -1;
1429  prev[init_node] = -1;
1430 
1431  static DoubleVector d_vals(m_n+1, LOC_INFINITY);
1432  static BoolVector visited(m_n+1, false);
1433 
1434  while(1) {
1435  int start_cur = row_ptr[cur_node];
1436  int end_cur = row_ptr[cur_node+1];
1437  for(i = start_cur; i < end_cur; i++) {
1438  int cur_row = rows[i];
1439  if(inB[cur_row]) continue;
1440  if(c_val[i] > LOC_INFINITY / 2.0) continue;
1441  double reduced_cval = c_val[i] - u_val[cur_row] - v_val[cur_node];
1442  if (reduced_cval + 1e-10 < 0)
1443  throw system_error(system_error::Negative_MC64_weight, "Negative reduced weight in MC64.");
1444  double d_new = lsp + reduced_cval;
1445  if(d_new < lsap) {
1446  if(!matched[cur_row]) {
1447  lsap = d_new;
1448  isap = cur_row;
1449  ksap = i;
1450 
1451  match_nodes[isap] = cur_node;
1452  } else if (d_new < d_vals[cur_row]){
1453  d_vals[cur_row] = d_new;
1454  prev[match_nodes[cur_row]] = cur_node;
1455  Q.push(Dijkstra(cur_row, d_new));
1456  irn[cur_row] = i;
1457  }
1458  }
1459  }
1460 
1461  Dijkstra min_d;
1462  bool found = false;
1463 
1464  while(!Q.empty()) {
1465  min_d = Q.top();
1466  Q.pop();
1467  if(visited[min_d.m_idx])
1468  continue;
1469  found = true;
1470  break;
1471  }
1472  if(!found)
1473  break;
1474 
1475  int tmp_idx = min_d.m_idx;
1476  visited[tmp_idx] = true;
1477 
1478  lsp = min_d.m_val;
1479  if(lsap <= lsp) {
1480  visited[tmp_idx] = false;
1481  d_vals[tmp_idx] = LOC_INFINITY;
1482  break;
1483  }
1484  inB[tmp_idx] = true;
1485  B[b_cnt++] = tmp_idx;
1486 
1487  cur_node = match_nodes[tmp_idx];
1488  }
1489 
1490  if(lsap < LOC_INFINITY / 2.0f) {
1491  matched[isap] = true;
1492  cur_node = match_nodes[isap];
1493 
1494  v_val[cur_node] = c_val[ksap];
1495 
1496  while(prev[cur_node] >= 0) {
1497  match_nodes[isap] = cur_node;
1498 
1499  int next_ksap = rev_match_nodes[cur_node];
1500  int next_isap = rows[next_ksap];
1501  next_ksap = irn[next_isap];
1502 
1503  rev_match_nodes[cur_node] = ksap;
1504 
1505  cur_node = prev[cur_node];
1506  isap = next_isap;
1507  ksap = next_ksap;
1508  }
1509  match_nodes[isap] = cur_node;
1510  rev_match_nodes[cur_node] = ksap;
1511  rev_matched[cur_node] = true;
1512  success = true;
1513 
1514  for (i = 0; i < b_cnt; i++) {
1515  int tmp_row = B[i];
1516  int j_val = match_nodes[tmp_row];
1517  int tmp_k = rev_match_nodes[j_val];
1518  u_val[tmp_row] += d_vals[tmp_row] - lsap;
1519  v_val[j_val] = c_val[tmp_k] - u_val[tmp_row];
1520  d_vals[tmp_row] = LOC_INFINITY;
1521  visited[tmp_row] = false;
1522  inB[tmp_row] = false;
1523  }
1524 
1525  while(!Q.empty()) {
1526  Dijkstra tmpD = Q.top();
1527  Q.pop();
1528  d_vals[tmpD.m_idx] = LOC_INFINITY;
1529  }
1530  }
1531 
1532  return success;
1533 }
1534 
1535 
1536 } // namespace spike
1537 
1538 
1539 #endif
Definition: graph.h:83