SpikeGPU  1.0.0
precond.h
#ifndef SPIKE_PRECOND_CUH
#define SPIKE_PRECOND_CUH

#include <cusp/blas.h>
#include <cusp/print.h>

#include <thrust/logical.h>
#include <thrust/functional.h>

#include <spike/common.h>
#include <spike/graph.h>
#include <spike/timer.h>
#include <spike/strided_range.h>
#include <spike/device/shuffle.cuh>


namespace spike {

template <typename PrecVector>
class Precond
{
public:
    typedef typename PrecVector::memory_space  MemorySpace;
    typedef typename PrecVector::value_type    PrecValueType;
    typedef typename PrecVector::iterator      PrecVectorIterator;

    typedef typename cusp::array1d<int, MemorySpace>           IntVector;
    typedef typename cusp::array1d<int, MemorySpace>           MatrixMap;   // index map (restored from usage below; elided in the listing)
    typedef typename cusp::array1d<PrecValueType, MemorySpace> MatrixMapF;

    typedef typename cusp::array1d<PrecValueType, cusp::host_memory> PrecVectorH;
    typedef typename cusp::array1d<int, cusp::host_memory>           IntVectorH;
    typedef typename cusp::array1d<int, cusp::host_memory>           MatrixMapH;  // host index map (restored from usage in transformToBandedMatrix(); elided in the listing)
    typedef typename cusp::array1d<PrecValueType, cusp::host_memory> MatrixMapFH;

    typedef typename cusp::coo_matrix<int, PrecValueType, MemorySpace>       PrecMatrixCoo;
    typedef typename cusp::coo_matrix<int, PrecValueType, cusp::host_memory> PrecMatrixCooH;


    Precond();

    Precond(int                 numPart,
            bool                reorder,
            bool                doMC64,
            bool                scale,
            double              dropOff_frac,
            int                 maxBandwidth,
            FactorizationMethod factMethod,
            PreconditionerType  precondType,
            bool                safeFactorization,
            bool                variableBandwidth,
            bool                trackReordering);

    Precond(const Precond& prec);

    ~Precond() {}

    Precond& operator=(const Precond& prec);

    double getTimeReorder() const      {return m_time_reorder;}
    double getTimeCPUAssemble() const  {return m_time_cpu_assemble;}
    double getTimeTransfer() const     {return m_time_transfer;}
    double getTimeToBanded() const     {return m_time_toBanded;}
    double getTimeCopyOffDiags() const {return m_time_offDiags;}
    double getTimeBandLU() const       {return m_time_bandLU;}
    double getTimeBandUL() const       {return m_time_bandUL;}
    double gettimeAssembly() const     {return m_time_assembly;}
    double getTimeFullLU() const       {return m_time_fullLU;}
    double getTimeShuffle() const      {return m_time_shuffle;}

    int    getBandwidthReordering() const {return m_k_reorder;}
    int    getBandwidthMC64() const       {return m_k_mc64;}
    int    getBandwidth() const           {return m_k;}

    int    getNumPartitions() const {return m_numPartitions;}
    double getActualDropOff() const {return (double) m_dropOff_actual;}

    template <typename Matrix>
    void setup(const Matrix& A);

    void update(const PrecVector& entries);

    void solve(PrecVector& v, PrecVector& z);

private:
    int                  m_numPartitions;
    int                  m_n;
    int                  m_k;

    bool                 m_reorder;
    bool                 m_doMC64;
    bool                 m_scale;
    PrecValueType        m_dropOff_frac;
    int                  m_maxBandwidth;
    FactorizationMethod  m_factMethod;
    PreconditionerType   m_precondType;
    bool                 m_safeFactorization;
    bool                 m_variableBandwidth;
    bool                 m_trackReordering;

    MatrixMap            m_offDiagMap;
    MatrixMap            m_WVMap;
    MatrixMap            m_typeMap;
    MatrixMap            m_bandedMatMap;
    MatrixMapF           m_scaleMap;

    // Used in variable-bandwidth method only, host versions
    IntVectorH           m_ks_host;
    IntVectorH           m_ks_row_host;
    IntVectorH           m_ks_col_host;
    IntVectorH           m_offDiagWidths_left_host;
    IntVectorH           m_offDiagWidths_right_host;
    IntVectorH           m_first_rows_host;
    IntVectorH           m_BOffsets_host;

    // Used in variable-bandwidth method only
    IntVector            m_ks;                   // All half-bandwidths
    IntVector            m_offDiagWidths_left;   // All left half-bandwidths in terms of rows
    IntVector            m_offDiagWidths_right;  // All right half-bandwidths in terms of rows
    IntVector            m_offDiagPerms_left;
    IntVector            m_offDiagPerms_right;
    IntVector            m_first_rows;
    IntVector            m_spike_ks;             // All half-bandwidths used for the spikes:
                                                 //   m_spike_ks[i] = MAX(m_ks[i], m_ks[i+1])
    IntVector            m_BOffsets;             // Offsets in banded matrix B
    IntVector            m_ROffsets;             // Offsets in matrix R
    IntVector            m_WVOffsets;            // Offsets in matrix WV
    IntVector            m_compB2Offsets;        // Offsets in matrix compB2
    IntVector            m_partialBOffsets;      // Offsets in matrix partialB

    IntVector            m_optPerm;              // RCM reordering
    IntVector            m_optReordering;        // RCM reverse reordering

    IntVector            m_secondReordering;     // 2nd-stage reverse reordering
    IntVector            m_secondPerm;           // 2nd-stage reordering

    PrecVector           m_mc64RowScale;         // MC64 row scaling
    PrecVector           m_mc64ColScale;         // MC64 column scaling

    PrecVector           m_B;                    // banded matrix (LU factors)
    PrecVector           m_offDiags;             // contains the off-diagonal blocks of the original banded matrix
    PrecVector           m_R;                    // diagonal blocks in the reduced matrix (LU factors)
    PrecVectorH          m_offDiags_host;        // Used with second-stage reordering only; host copy of the off-diagonals from the graph
    PrecVectorH          m_WV_host;

    int                  m_k_reorder;            // bandwidth after reordering
    int                  m_k_mc64;               // bandwidth after MC64

    PrecValueType        m_dropOff_actual;       // actual drop-off fraction achieved

    GPUTimer             m_timer;
    double               m_time_reorder;         // CPU time for matrix reordering
    double               m_time_cpu_assemble;    // CPU time to assemble the banded matrix and off-diagonal matrices
    double               m_time_transfer;        // time to transfer data from CPU to GPU
    double               m_time_toBanded;        // GPU time for transformation/conversion to banded format
    double               m_time_offDiags;        // GPU time to copy off-diagonal blocks
    double               m_time_bandLU;          // GPU time for LU factorization of banded blocks
    double               m_time_bandUL;          // GPU time for UL factorization of banded blocks
    double               m_time_assembly;        // GPU time for assembling the reduced matrix
    double               m_time_fullLU;          // GPU time for LU factorization of reduced matrix
    double               m_time_shuffle;         // cumulative GPU time for permutation and scaling

    template <typename Matrix>
    void transformToBandedMatrix(const Matrix& A);

    template <typename Matrix>
    void convertToBandedMatrix(const Matrix& A);

    void extractOffDiagonal(PrecVector& mat_WV);

    void partBandedLU();
    void partBandedLU_const();
    void partBandedLU_one();
    void partBandedLU_var();
    void partBandedUL(PrecVector& B);

    void partBandedFwdSweep(PrecVector& v);
    void partBandedFwdSweep_const(PrecVector& v);
    void partBandedFwdSweep_var(PrecVector& v);
    void partBandedBckSweep(PrecVector& v);
    void partBandedBckSweep_const(PrecVector& v);
    void partBandedBckSweep_var(PrecVector& v);

    void partFullLU();
    void partFullLU_const();
    void partFullLU_var();

    void partFullFwdSweep(PrecVector& v);
    void partFullBckSweep(PrecVector& v);
    void purifyRHS(PrecVector& v, PrecVector& res);

    void calculateSpikes(PrecVector& WV);
    void calculateSpikes_const(PrecVector& WV);
    void calculateSpikes_var(PrecVector& WV);
    void calculateSpikes_var_old(PrecVector& WV);
    void calculateSpikes(PrecVector& B2, PrecVector& WV);

    int  adjustNumThreads(int inNumThreads);


    void assembleReducedMat(PrecVector& WV);

    void copyLastPartition(PrecVector& B2);

    void leftTrans(PrecVector& v, PrecVector& z);
    void rightTrans(PrecVector& v, PrecVector& z);
    void permute(PrecVector& v, IntVector& perm, PrecVector& w);
    void permuteAndScale(PrecVector& v, IntVector& perm, PrecVector& scale, PrecVector& w);
    void scaleAndPermute(PrecVector& v, IntVector& perm, PrecVector& scale, PrecVector& w);

    void combinePermutation(IntVector& perm, IntVector& perm2, IntVector& finalPerm);
    void getSRev(PrecVector& rhs, PrecVector& sol);

    bool hasZeroPivots(const PrecVectorIterator& start_B,
                       const PrecVectorIterator& end_B,
                       int                       k,
                       PrecValueType             threshold);
};

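// ---------------------------------------------------------------------------
// Illustrative usage sketch (added; not part of the original header). It
// assumes a double-precision cusp device array and a CSR system matrix; the
// variable names below are hypothetical:
//
//     typedef cusp::array1d<double, cusp::device_memory> Vector;
//
//     cusp::csr_matrix<int, double, cusp::device_memory> A /* = ... */;
//     Vector v(A.num_rows);   // input vector
//     Vector z(A.num_rows);   // output: z such that M*z = v
//
//     Precond<Vector> prec(4,        // numPart
//                          true,     // reorder
//                          true,     // doMC64
//                          true,     // scale
//                          0.01,     // dropOff_frac
//                          std::numeric_limits<int>::max(),  // maxBandwidth
//                          LU_only,  // factMethod
//                          Spike,    // precondType
//                          false,    // safeFactorization
//                          true,     // variableBandwidth
//                          false);   // trackReordering
//     prec.setup(A);
//     prec.solve(v, z);
// ---------------------------------------------------------------------------
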
// Functor objects
//
// TODO: figure out why I cannot make these private to Precond...
template <typename T>
struct Multiply : public thrust::unary_function<thrust::tuple<T, T>, T>
{
    __host__ __device__
    T operator()(thrust::tuple<T, T> tu) {
        return thrust::get<0>(tu) * thrust::get<1>(tu);
    }
};

template <typename T>
struct SmallerThan : public thrust::unary_function<T, bool>
{
    SmallerThan(T threshold) : m_threshold(threshold) {}

    __host__ __device__
    bool operator()(T val) {return std::abs(val) < m_threshold;}

    T m_threshold;
};
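
// A minimal sketch (added) of how such a predicate is typically combined with
// thrust::any_of (declared in the already-included <thrust/logical.h>) to
// detect zero or near-zero pivots on the device; hasZeroPivots() performs a
// test of this kind on the diagonal of the factored band. The iterator names
// here are hypothetical:
//
//     bool bad = thrust::any_of(diag_begin, diag_end,
//                               SmallerThan<PrecValueType>(threshold));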


// Constructor -- see the parameter list in the class declaration above.
template <typename PrecVector>
Precond<PrecVector>::Precond(int                 numPart,
                             bool                reorder,
                             bool                doMC64,
                             bool                scale,
                             double              dropOff_frac,
                             int                 maxBandwidth,
                             FactorizationMethod factMethod,
                             PreconditionerType  precondType,
                             bool                safeFactorization,
                             bool                variableBandwidth,
                             bool                trackReordering)
:   m_numPartitions(numPart),
    m_reorder(reorder),
    m_doMC64(doMC64),
    m_scale(scale),
    m_dropOff_frac((PrecValueType)dropOff_frac),
    m_maxBandwidth(maxBandwidth),
    m_factMethod(factMethod),
    m_precondType(precondType),
    m_safeFactorization(safeFactorization),
    m_variableBandwidth(variableBandwidth),
    m_trackReordering(trackReordering),
    m_k_reorder(0),
    m_k_mc64(0),
    m_dropOff_actual(0),
    m_time_reorder(0),
    m_time_cpu_assemble(0),
    m_time_transfer(0),
    m_time_toBanded(0),
    m_time_offDiags(0),
    m_time_bandLU(0),
    m_time_bandUL(0),
    m_time_assembly(0),
    m_time_fullLU(0),
    m_time_shuffle(0)
{
}

// Default constructor.
template <typename PrecVector>
Precond<PrecVector>::Precond()
:   m_reorder(false),
    m_doMC64(false),
    m_scale(false),
    m_k_reorder(0),
    m_k_mc64(0),
    m_dropOff_actual(0),
    m_maxBandwidth(std::numeric_limits<int>::max()),
    m_time_reorder(0),
    m_time_cpu_assemble(0),
    m_time_transfer(0),
    m_time_toBanded(0),
    m_time_offDiags(0),
    m_time_bandLU(0),
    m_time_bandUL(0),
    m_time_assembly(0),
    m_time_fullLU(0),
    m_time_shuffle(0)
{
}

// Copy constructor.
template <typename PrecVector>
Precond<PrecVector>::Precond(const Precond<PrecVector>& prec)
:   m_k_reorder(0),
    m_k_mc64(0),
    m_dropOff_actual(0),
    m_time_reorder(0),
    m_time_cpu_assemble(0),
    m_time_transfer(0),
    m_time_toBanded(0),
    m_time_offDiags(0),
    m_time_bandLU(0),
    m_time_bandUL(0),
    m_time_assembly(0),
    m_time_fullLU(0),
    m_time_shuffle(0)
{
    m_numPartitions     = prec.m_numPartitions;

    m_reorder           = prec.m_reorder;
    m_doMC64            = prec.m_doMC64;
    m_scale             = prec.m_scale;
    m_dropOff_frac      = prec.m_dropOff_frac;
    m_maxBandwidth      = prec.m_maxBandwidth;
    m_factMethod        = prec.m_factMethod;
    m_precondType       = prec.m_precondType;
    m_safeFactorization = prec.m_safeFactorization;
    m_variableBandwidth = prec.m_variableBandwidth;
    m_trackReordering   = prec.m_trackReordering;
}

// Assignment operator.
template <typename PrecVector>
Precond<PrecVector>&
Precond<PrecVector>::operator=(const Precond<PrecVector>& prec)
{
    m_numPartitions     = prec.m_numPartitions;

    m_reorder           = prec.m_reorder;
    m_doMC64            = prec.m_doMC64;
    m_scale             = prec.m_scale;
    m_dropOff_frac      = prec.m_dropOff_frac;
    m_maxBandwidth      = prec.m_maxBandwidth;
    m_factMethod        = prec.m_factMethod;
    m_precondType       = prec.m_precondType;
    m_safeFactorization = prec.m_safeFactorization;
    m_variableBandwidth = prec.m_variableBandwidth;
    m_trackReordering   = prec.m_trackReordering;

    m_ks_host                  = prec.m_ks_host;
    m_offDiagWidths_left_host  = prec.m_offDiagWidths_left_host;
    m_offDiagWidths_right_host = prec.m_offDiagWidths_right_host;
    m_first_rows_host          = prec.m_first_rows_host;
    m_BOffsets_host            = prec.m_BOffsets_host;

    m_time_shuffle = 0;
    return *this;
}


// update() -- recompute the preconditioner for a new set of matrix values,
// reusing the maps built during setup() when reordering tracking is enabled.
template <typename PrecVector>
void
Precond<PrecVector>::update(const PrecVector& entries)
{
    m_time_reorder = 0.0;

    m_timer.Start();


    cusp::blas::fill(m_B, (PrecValueType) 0);

    thrust::scatter_if(
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.begin(), m_scaleMap.begin())), Multiply<PrecValueType>()),
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.end(), m_scaleMap.end())), Multiply<PrecValueType>()),
            m_bandedMatMap.begin(),
            m_typeMap.begin(),
            m_B.begin()
    );
    m_timer.Stop();
    m_time_cpu_assemble = m_timer.getElapsed();

    m_time_transfer = 0.0;

    if (m_k == 0)
        return;


    // If we are using a block-diagonal preconditioner or a single partition,
    // perform the LU factorization of the banded matrix and return.
    if (m_precondType == Block || m_numPartitions == 1) {

        m_timer.Start();
        partBandedLU();
        m_timer.Stop();
        m_time_bandLU = m_timer.getElapsed();


        return;
    }

    // We are using more than one partition, so we must assemble the
    // truncated Spike reduced matrix R.
    m_R.resize((2 * m_k) * (2 * m_k) * (m_numPartitions - 1));

    // Extract off-diagonal blocks from the banded matrix and store them
    // in the array m_offDiags.
    PrecVector mat_WV;
    mat_WV.resize(2 * m_k * m_k * (m_numPartitions - 1));
    cusp::blas::fill(m_offDiags, (PrecValueType) 0);

    m_timer.Start();

    thrust::scatter_if(
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.begin(), m_scaleMap.begin())), Multiply<PrecValueType>()),
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.end(), m_scaleMap.end())), Multiply<PrecValueType>()),
            m_offDiagMap.begin(),
            m_typeMap.begin(),
            m_offDiags.begin(),
            thrust::logical_not<int>()
    );

    thrust::scatter_if(
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.begin(), m_scaleMap.begin())), Multiply<PrecValueType>()),
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.end(), m_scaleMap.end())), Multiply<PrecValueType>()),
            m_WVMap.begin(),
            m_typeMap.begin(),
            mat_WV.begin(),
            thrust::logical_not<int>()
    );
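    // Note (added): per the semantics of thrust::scatter_if, each scaled
    // entry entries[i] * m_scaleMap[i] is written to the output position
    // given by the map iterator, but only where the stencil value
    // m_typeMap[i] satisfies the predicate. With logical_not<int>, the two
    // calls above scatter only the entries whose type flag is 0 (the
    // off-diagonal ones); the earlier scatter_if into m_B used the default
    // predicate and therefore took the entries with a nonzero flag.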
    m_timer.Stop();
    m_time_offDiags = m_timer.getElapsed();


    switch (m_factMethod) {
    case LU_only:
        // In this case, we perform the partitioned LU factorization of D
        // and use the L and U factors to compute both the bottom of the
        // right spikes (using short sweeps) and the top of the left spikes
        // (using full sweeps). Finally, we assemble the reduced matrix R.
        {
            m_timer.Start();
            partBandedLU();
            m_timer.Stop();
            m_time_bandLU = m_timer.getElapsed();


            m_timer.Start();
            calculateSpikes(mat_WV);
            assembleReducedMat(mat_WV);
            m_timer.Stop();
            m_time_assembly = m_timer.getElapsed();
        }

        break;

    case LU_UL:
        // In this case, we perform the partitioned LU factorization of D
        // and use the L and U factors to compute the bottom of the right
        // spikes (using short sweeps). We then perform a partitioned UL
        // factorization, using a copy of the banded matrix, and use the
        // resulting U and L factors to compute the top of the left spikes
        // (using short sweeps). Finally, we assemble the reduced matrix R.
        {
            PrecVector B2 = m_B;

            cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);

            m_timer.Start();
            partBandedLU();
            m_timer.Stop();
            m_time_bandLU = m_timer.getElapsed();


            m_timer.Start();
            partBandedUL(B2);
            m_timer.Stop();
            m_time_bandUL = m_timer.getElapsed();


            cudaDeviceSetCacheConfig(cudaFuncCachePreferNone);

            m_timer.Start();
            calculateSpikes(B2, mat_WV);
            assembleReducedMat(mat_WV);
            copyLastPartition(B2);
            m_timer.Stop();
            m_time_assembly = m_timer.getElapsed();
        }

        break;
    }


    // Perform (in-place) LU factorization of the reduced matrix.
    m_timer.Start();
    partFullLU();
    m_timer.Stop();
    m_time_fullLU = m_timer.getElapsed();

}

// setup() -- set up the preconditioner for matrix A: reorder/scale (if
// enabled), assemble the banded matrix, and compute the factorization.
template <typename PrecVector>
template <typename Matrix>
void
Precond<PrecVector>::setup(const Matrix& A)
{
    m_n = A.num_rows;

    // Form the banded matrix based on the specified matrix, either through
    // transformation (reordering and drop-off) or straight conversion.
    if (m_reorder)
        transformToBandedMatrix(A);
    else
        convertToBandedMatrix(A);

    if (m_k == 0)
        return;


    // If we are using a block-diagonal preconditioner or a single partition,
    // perform the LU factorization of the banded matrix and return.
    if (m_precondType == Block || m_numPartitions == 1) {

        m_timer.Start();
        partBandedLU();
        m_timer.Stop();
        m_time_bandLU = m_timer.getElapsed();


        return;
    }

    // We are using more than one partition, so we must assemble the
    // truncated Spike reduced matrix R.
    m_R.resize((2 * m_k) * (2 * m_k) * (m_numPartitions - 1));

    // Extract off-diagonal blocks from the banded matrix and store them
    // in the array m_offDiags.
    PrecVector mat_WV;
    mat_WV.resize(2 * m_k * m_k * (m_numPartitions - 1));

    m_timer.Start();
    extractOffDiagonal(mat_WV);
    m_timer.Stop();
    m_time_offDiags = m_timer.getElapsed();


    switch (m_factMethod) {
    case LU_only:
        // In this case, we perform the partitioned LU factorization of D
        // and use the L and U factors to compute both the bottom of the
        // right spikes (using short sweeps) and the top of the left spikes
        // (using full sweeps). Finally, we assemble the reduced matrix R.
        {
            m_timer.Start();
            partBandedLU();
            m_timer.Stop();
            m_time_bandLU = m_timer.getElapsed();


            m_timer.Start();
            calculateSpikes(mat_WV);
            assembleReducedMat(mat_WV);
            m_timer.Stop();
            m_time_assembly = m_timer.getElapsed();
        }

        break;

    case LU_UL:
        // In this case, we perform the partitioned LU factorization of D
        // and use the L and U factors to compute the bottom of the right
        // spikes (using short sweeps). We then perform a partitioned UL
        // factorization, using a copy of the banded matrix, and use the
        // resulting U and L factors to compute the top of the left spikes
        // (using short sweeps). Finally, we assemble the reduced matrix R.
        {
            PrecVector B2 = m_B;

            cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);

            m_timer.Start();
            partBandedLU();
            m_timer.Stop();
            m_time_bandLU = m_timer.getElapsed();


            m_timer.Start();
            partBandedUL(B2);
            m_timer.Stop();
            m_time_bandUL = m_timer.getElapsed();


            cudaDeviceSetCacheConfig(cudaFuncCachePreferNone);

            m_timer.Start();
            calculateSpikes(B2, mat_WV);
            assembleReducedMat(mat_WV);
            copyLastPartition(B2);
            m_timer.Stop();
            m_time_assembly = m_timer.getElapsed();
        }

        break;
    }


    // Perform (in-place) LU factorization of the reduced matrix.
    m_timer.Start();
    partFullLU();
    m_timer.Stop();
    m_time_fullLU = m_timer.getElapsed();
}

// solve() -- apply the preconditioner: given v, compute z such that M*z = v.
template <typename PrecVector>
void
Precond<PrecVector>::solve(PrecVector& v,
                           PrecVector& z)
{
    if (m_reorder) {
        leftTrans(v, z);
        static PrecVector buffer;
        buffer.resize(m_n);
        getSRev(z, buffer);
        rightTrans(buffer, z);
    } else {
        cusp::blas::copy(v, z);
        PrecVector buffer = z;
        getSRev(buffer, z);
    }
}

// getSRev() -- given the (transformed) right-hand side 'rhs', compute the
// preconditioned solution 'sol'.
template <typename PrecVector>
void
Precond<PrecVector>::getSRev(PrecVector& rhs,
                             PrecVector& sol)
{
    if (m_k == 0) {
        thrust::transform(rhs.begin(), rhs.end(), m_B.begin(), sol.begin(), thrust::divides<PrecValueType>());
        return;
    }

    if (m_numPartitions > 1 && m_precondType == Spike) {
        if (!m_variableBandwidth) {
            sol = rhs;
            // Calculate modified RHS
            partBandedFwdSweep(rhs);
            partBandedBckSweep(rhs);

            // Solve reduced system
            partFullFwdSweep(rhs);
            partFullBckSweep(rhs);

            // Purify RHS
            purifyRHS(rhs, sol);
        } else {
            static PrecVector buffer;
            buffer.resize(m_n);
            permute(rhs, m_secondReordering, buffer);
            // Calculate modified RHS
            partBandedFwdSweep(rhs);
            partBandedBckSweep(rhs);

            permute(rhs, m_secondReordering, sol);

            // Solve reduced system
            partFullFwdSweep(sol);
            partFullBckSweep(sol);

            purifyRHS(sol, buffer);
            permute(buffer, m_secondPerm, sol);
        }
    } else
        sol = rhs;

    // Get purified solution
    partBandedFwdSweep(sol);
    partBandedBckSweep(sol);
}
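
// Added reading of the truncated Spike branch above: with D the block-diagonal
// part of the banded matrix,
//     1. rhs' = D^{-1} * rhs        ("Calculate modified RHS": banded sweeps)
//     2. solve R * y = rhs'         ("Solve reduced system": full sweeps on
//                                    the (2k x 2k) reduced blocks)
//     3. purifyRHS(...)             (remove the coupling contributions)
//     4. sol  = D^{-1} * (purified) ("Get purified solution": banded sweeps)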

// leftTrans() -- apply the left transformation (row scaling and/or permutation).
template <typename PrecVector>
void
Precond<PrecVector>::leftTrans(PrecVector& v,
                               PrecVector& z)
{
    if (m_scale)
        scaleAndPermute(v, m_optPerm, m_mc64RowScale, z);
    else
        permute(v, m_optPerm, z);
}

// rightTrans() -- apply the right transformation (permutation and/or column scaling).
template <typename PrecVector>
void
Precond<PrecVector>::rightTrans(PrecVector& v,
                                PrecVector& z)
{
    if (m_scale)
        permuteAndScale(v, m_optReordering, m_mc64ColScale, z);
    else
        permute(v, m_optReordering, z);
}

// permute() -- scatter v into w according to perm: w[perm[i]] = v[i].
template <typename PrecVector>
void
Precond<PrecVector>::permute(PrecVector& v,
                             IntVector&  perm,
                             PrecVector& w)
{
    m_timer.Start();
    thrust::scatter(v.begin(), v.end(), perm.begin(), w.begin());
    m_timer.Stop();
    m_time_shuffle += m_timer.getElapsed();
}

// permuteAndScale() -- w[perm[i]] = v[i] * scale[perm[i]].
template <typename PrecVector>
void
Precond<PrecVector>::permuteAndScale(PrecVector& v,
                                     IntVector&  perm,
                                     PrecVector& scale,
                                     PrecVector& w)
{
    m_timer.Start();

    thrust::scatter(
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(v.begin(), thrust::make_permutation_iterator(scale.begin(), perm.begin()))), Multiply<PrecValueType>()),
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(v.end(), thrust::make_permutation_iterator(scale.end(), perm.end()))), Multiply<PrecValueType>()),
            perm.begin(),
            w.begin()
    );

    m_timer.Stop();
    m_time_shuffle += m_timer.getElapsed();
}

// scaleAndPermute() -- w[perm[i]] = v[i] * scale[i].
template <typename PrecVector>
void
Precond<PrecVector>::scaleAndPermute(PrecVector& v,
                                     IntVector&  perm,
                                     PrecVector& scale,
                                     PrecVector& w)
{
    m_timer.Start();
    thrust::scatter(
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(v.begin(), scale.begin())), Multiply<PrecValueType>()),
            thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(v.end(), scale.end())), Multiply<PrecValueType>()),
            perm.begin(),
            w.begin()
    );
    m_timer.Stop();
    m_time_shuffle += m_timer.getElapsed();
}

// combinePermutation() -- finalPerm[i] = perm2[perm[i]].
template <typename PrecVector>
void
Precond<PrecVector>::combinePermutation(IntVector& perm,
                                        IntVector& perm2,
                                        IntVector& finalPerm)
{
    m_timer.Start();
    thrust::gather(perm.begin(), perm.end(), perm2.begin(), finalPerm.begin());
    m_timer.Stop();
    m_time_shuffle += m_timer.getElapsed();
}


// transformToBandedMatrix() -- reorder/scale the input matrix, apply
// drop-off, and assemble the banded matrix on the host.
template <typename PrecVector>
template <typename Matrix>
void
Precond<PrecVector>::transformToBandedMatrix(const Matrix& A)
{
    CPUTimer reorder_timer, assemble_timer, transfer_timer;

    transfer_timer.Start();

    // Reorder the matrix and apply drop-off. For this, we convert the
    // input matrix to COO format and copy it to the host.
    PrecMatrixCooH Acoo = A;
    transfer_timer.Stop();
    m_time_transfer = transfer_timer.getElapsed();

    PrecVectorH  B;
    IntVectorH   optReordering;
    IntVectorH   optPerm;
    IntVectorH   mc64RowPerm;
    PrecVectorH  mc64RowScale;
    PrecVectorH  mc64ColScale;
    IntVectorH   secondReorder;
    IntVectorH   secondPerm;

    Graph<PrecValueType>  graph(m_trackReordering);

    IntVectorH   offDiagPerms_left;
    IntVectorH   offDiagPerms_right;

    MatrixMapH   offDiagMap;
    MatrixMapH   WVMap;
    MatrixMapH   typeMap;
    MatrixMapH   bandedMatMap;
    MatrixMapFH  scaleMap;


    reorder_timer.Start();
    m_k_reorder = graph.reorder(Acoo, m_doMC64, m_scale, optReordering, optPerm, mc64RowPerm, mc64RowScale, mc64ColScale, scaleMap, m_k_mc64);
    reorder_timer.Stop();

    m_time_reorder += reorder_timer.getElapsed();

    int dropped = 0;

    if (m_k_reorder > m_maxBandwidth || m_dropOff_frac > 0)
        dropped = graph.dropOff(m_dropOff_frac, m_maxBandwidth, m_dropOff_actual);
    else
        m_dropOff_actual = 0;

    // FIXME: this is problematic if, for some off-diagonal blocks, there are
    // no elements at all.
    m_k = m_k_reorder - dropped;


    // Verify that the required number of partitions is consistent with the
    // problem size and half-bandwidth. If 'n' is the smallest partition size,
    // the following condition must be satisfied:
    //   K+1 <= n   (for the Spike algorithm)
    // This implies a maximum allowable number of partitions.
    int maxNumPartitions = std::max(m_n / (m_k + 1), 1);
    m_numPartitions = std::min(m_numPartitions, maxNumPartitions);

    // If there is just one partition, force using the constant-bandwidth method.
    if (m_numPartitions == 1 || m_k == 0)
        m_variableBandwidth = false;


    // Assemble the banded matrix.
    if (m_variableBandwidth) {
        assemble_timer.Start();
        graph.assembleOffDiagMatrices(m_k, m_numPartitions, m_WV_host, m_offDiags_host, m_offDiagWidths_left_host, m_offDiagWidths_right_host, offDiagPerms_left, offDiagPerms_right, typeMap, offDiagMap, WVMap);
        assemble_timer.Stop();
        m_time_cpu_assemble += assemble_timer.getElapsed();

        reorder_timer.Start();
        graph.secondLevelReordering(m_k, m_numPartitions, secondReorder, secondPerm, m_first_rows_host);
        reorder_timer.Stop();
        m_time_reorder += reorder_timer.getElapsed();

        assemble_timer.Start();
        graph.assembleBandedMatrix(m_k, m_numPartitions, m_ks_col_host, m_ks_row_host, B,
                                   m_ks_host, m_BOffsets_host,
                                   typeMap, bandedMatMap);
        assemble_timer.Stop();
        m_time_cpu_assemble += assemble_timer.getElapsed();
    } else {
        assemble_timer.Start();
        graph.assembleBandedMatrix(m_k, m_ks_col_host, m_ks_row_host, B, typeMap, bandedMatMap);
        assemble_timer.Stop();
        m_time_cpu_assemble += assemble_timer.getElapsed();
    }

    transfer_timer.Start();

    // Copy the banded matrix and permutation data to the device.
    m_optReordering = optReordering;
    m_optPerm = optPerm;

    if (m_scale) {
        m_mc64RowScale = mc64RowScale;
        m_mc64ColScale = mc64ColScale;
    }

    m_B = B;

    if (m_variableBandwidth) {
        m_ks                  = m_ks_host;
        m_offDiagWidths_left  = m_offDiagWidths_left_host;
        m_offDiagWidths_right = m_offDiagWidths_right_host;
        m_offDiagPerms_left   = offDiagPerms_left;
        m_offDiagPerms_right  = offDiagPerms_right;
        m_BOffsets            = m_BOffsets_host;

        m_spike_ks.resize(m_numPartitions - 1);
        m_ROffsets.resize(m_numPartitions - 1);
        m_WVOffsets.resize(m_numPartitions - 1);
        m_compB2Offsets.resize(m_numPartitions - 1);
        m_partialBOffsets.resize(m_numPartitions - 1);

        cusp::blas::fill(m_spike_ks, m_k);
        thrust::sequence(m_ROffsets.begin(), m_ROffsets.end(), 0, 4 * m_k * m_k);
        thrust::sequence(m_WVOffsets.begin(), m_WVOffsets.end(), 0, 2 * m_k * m_k);
        thrust::sequence(m_partialBOffsets.begin(), m_partialBOffsets.end(), 0, 2 * (m_k + 1) * (2 * m_k + 1));
        thrust::sequence(m_compB2Offsets.begin(), m_compB2Offsets.end(), 0, 2 * m_k * (2 * m_k + 1));
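        // Added note on the strides above: each reduced-matrix block R_i is
        // (2*m_k) x (2*m_k), so consecutive ROffsets differ by 4*m_k*m_k, and
        // each W/V spike pair occupies 2*m_k*m_k entries, matching the sizing
        // of m_R and mat_WV in setup() and update().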
    }

    if (m_variableBandwidth) {
        IntVector buffer2(m_n);
        m_secondReordering = secondReorder;
        combinePermutation(m_secondReordering, m_optReordering, buffer2);
        m_optReordering = buffer2;

        m_secondPerm = secondPerm;
        combinePermutation(m_optPerm, m_secondPerm, buffer2);
        m_optPerm = buffer2;

        m_first_rows = m_first_rows_host;
    }

    {
        IntVector buffer = mc64RowPerm, buffer2(m_n);
        combinePermutation(buffer, m_optPerm, buffer2);
        m_optPerm = buffer2;
    }

    if (m_trackReordering) {
        m_offDiagMap   = offDiagMap;
        m_WVMap        = WVMap;
        m_typeMap      = typeMap;
        m_bandedMatMap = bandedMatMap;
        m_scaleMap     = scaleMap;
    }

    transfer_timer.Stop();
    m_time_transfer += transfer_timer.getElapsed();
}

// convertToBandedMatrix() -- load the input matrix into banded storage
// without any reordering or drop-off.
template <typename PrecVector>
template <typename Matrix>
void
Precond<PrecVector>::convertToBandedMatrix(const Matrix& A)
{
    // Convert matrix to COO format.
    PrecMatrixCoo Acoo = A;
    int n   = Acoo.num_rows;
    int nnz = Acoo.num_entries;

    // Calculate the half-bandwidth. Note that we use an explicit code block
    // so that the temporary array 'buffer' is freed before we resize m_B
    // (otherwise, we may run out of memory).
    {
        IntVector buffer(nnz);
        cusp::blas::axpby(Acoo.row_indices, Acoo.column_indices, buffer, 1, -1);
        m_k = cusp::blas::nrmmax(buffer);
    }
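    // (Added example: m_k is the half-bandwidth, i.e. max|i - j| over all
    // nonzeros; a tridiagonal matrix gives m_k = 1 and is stored below in
    // (2*m_k + 1) * n = 3*n entries of m_B.)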

    // Verify that the required number of partitions is consistent with the
    // problem size and half-bandwidth. If 'n' is the smallest partition size,
    // the following two conditions must be satisfied:
    //   (1) K+1 <= n   (for the Spike algorithm)
    //   (2) 2*K <= n   (for the current implementation of UL)
    // These imply a maximum allowable number of partitions.
    int maxNumPartitions = std::max(1, m_n / std::max(m_k + 1, 2 * m_k));
    m_numPartitions = std::min(m_numPartitions, maxNumPartitions);

    // If there is just one partition, force using the constant-bandwidth method.
    if (m_numPartitions == 1)
        m_variableBandwidth = false;

    // Set the size and load the banded matrix into m_B.
    m_B.resize((2 * m_k + 1) * n);

    int blockX = nnz, gridX = 1, gridY = 1;
    kernelConfigAdjust(blockX, gridX, gridY, BLOCK_SIZE, MAX_GRID_DIMENSION);
    dim3 grids(gridX, gridY);

    int*           d_rows = thrust::raw_pointer_cast(&(Acoo.row_indices[0]));
    int*           d_cols = thrust::raw_pointer_cast(&(Acoo.column_indices[0]));
    PrecValueType* d_vals = thrust::raw_pointer_cast(&(Acoo.values[0]));
    PrecValueType* dB     = thrust::raw_pointer_cast(&m_B[0]);

    m_timer.Start();
    device::copyFromCOOMatrixToBandedMatrix<<<grids, blockX>>>(nnz, m_k, d_rows, d_cols, d_vals, dB);
    m_timer.Stop();
    m_time_toBanded = m_timer.getElapsed();
}


// extractOffDiagonal() -- extract the off-diagonal blocks of the banded
// matrix into m_offDiags and the spike work array mat_WV.
template <typename PrecVector>
void
Precond<PrecVector>::extractOffDiagonal(PrecVector& mat_WV)
{
    // If second-level reordering is enabled, the off-diagonal matrices are
    // already on the host.
    if (m_variableBandwidth) {
        mat_WV     = m_WV_host;
        m_offDiags = m_offDiags_host;
        return;
    }

    m_offDiags.resize(2 * m_k * m_k * (m_numPartitions - 1));

    PrecValueType* p_B        = thrust::raw_pointer_cast(&m_B[0]);
    PrecValueType* p_WV       = thrust::raw_pointer_cast(&mat_WV[0]);
    PrecValueType* p_offDiags = thrust::raw_pointer_cast(&m_offDiags[0]);
    int*           p_ks       = thrust::raw_pointer_cast(&m_ks[0]);

    int partSize  = m_n / m_numPartitions;
    int remainder = m_n % m_numPartitions;

    dim3 grids(m_k, m_numPartitions - 1);

    if (m_k > 1024)
        device::copydWV_general<PrecValueType><<<grids, 512>>>(m_k, p_B, p_WV, p_offDiags, partSize, m_numPartitions, remainder);
    else if (m_k > 32)
        device::copydWV_g32<PrecValueType><<<grids, m_k>>>(m_k, p_B, p_WV, p_offDiags, partSize, m_numPartitions, remainder);
    else
        device::copydWV<PrecValueType><<<m_numPartitions - 1, m_k * m_k>>>(m_k, p_B, p_WV, p_offDiags, partSize, m_numPartitions, remainder);
}
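
// Added note: the three-way dispatch above is a recurring pattern in this
// file. When the half-bandwidth exceeds the maximum block size (1024
// threads), a "_general" kernel with a fixed 512-thread block loops over the
// work; the "_g32" variants handle bandwidths greater than 32 with m_k
// threads per block; and the narrowest case packs an entire k x k tile into
// a single thread block.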

// partFullLU() -- in-place LU factorization of the reduced matrix R, whose
// diagonal blocks are dense and (2*k) x (2*k).
template <typename PrecVector>
void
Precond<PrecVector>::partFullLU()
{
    if (!m_variableBandwidth)
        partFullLU_const();
    else
        partFullLU_var();
}


template <typename PrecVector>
void
Precond<PrecVector>::partFullLU_const()
{
    PrecValueType* d_R = thrust::raw_pointer_cast(&m_R[0]);
    int two_k = 2 * m_k;

    // The first k rows of each diagonal block do not need a division step and
    // always use a pivot = 1.
    {
        dim3 grids(m_k, m_numPartitions - 1);

        if (m_k > 1024)
            device::fullLU_sub_spec_general<PrecValueType><<<grids, 512>>>(d_R, two_k, m_k);
        else
            device::fullLU_sub_spec<PrecValueType><<<grids, m_k>>>(d_R, two_k, m_k);
    }

    // The following k rows of each diagonal block require first a division by
    // the pivot.
    if (m_safeFactorization) {
        for (int i = m_k; i < two_k - 1; i++) {
            int  threads = two_k - 1 - i;
            dim3 grids(two_k - 1 - i, m_numPartitions - 1);

            if (threads > 1024) {
                device::fullLU_div_safe_general<PrecValueType><<<m_numPartitions - 1, 512>>>(d_R, m_k, two_k, i);
                device::fullLU_sub_general<PrecValueType><<<grids, 512>>>(d_R, m_k, two_k, i);
            } else {
                device::fullLU_div_safe<PrecValueType><<<m_numPartitions - 1, threads>>>(d_R, two_k, i);
                device::fullLU_sub<PrecValueType><<<grids, threads>>>(d_R, two_k, i);
            }
        }
    } else {
        for (int i = m_k; i < two_k - 1; i++) {
            int  threads = two_k - 1 - i;
            dim3 grids(two_k - 1 - i, m_numPartitions - 1);

            if (threads > 1024) {
                device::fullLU_div_general<PrecValueType><<<m_numPartitions - 1, 512>>>(d_R, m_k, two_k, i);
                device::fullLU_sub_general<PrecValueType><<<grids, 512>>>(d_R, m_k, two_k, i);
            } else {
                device::fullLU_div<PrecValueType><<<m_numPartitions - 1, threads>>>(d_R, two_k, i);
                device::fullLU_sub<PrecValueType><<<grids, threads>>>(d_R, two_k, i);
            }
        }
    }
}

template <typename PrecVector>
void
Precond<PrecVector>::partFullLU_var()
{
    PrecValueType* d_R        = thrust::raw_pointer_cast(&m_R[0]);
    int*           p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
    int*           p_ROffsets = thrust::raw_pointer_cast(&m_ROffsets[0]);

    int two_k = 2 * m_k;

    // The first k rows of each diagonal block do not need a division step and
    // always use a pivot = 1.
    {
        dim3 grids(m_k, m_numPartitions - 1);

        if (m_k > 1024)
            device::var::fullLU_sub_spec_general<PrecValueType><<<grids, 512>>>(d_R, p_spike_ks, p_ROffsets);
        else
            device::var::fullLU_sub_spec<PrecValueType><<<grids, m_k>>>(d_R, p_spike_ks, p_ROffsets);
    }

    // The following k rows of each diagonal block require first a division by
    // the pivot.
    if (m_safeFactorization) {
        for (int i = m_k; i < two_k - 1; i++) {
            int  threads = two_k - 1 - i;
            dim3 grids(two_k - 1 - i, m_numPartitions - 1);

            if (threads > 1024) {
                device::var::fullLU_div_safe_general<PrecValueType><<<m_numPartitions - 1, 512>>>(d_R, p_spike_ks, p_ROffsets, i);
                device::var::fullLU_sub_general<PrecValueType><<<grids, 512>>>(d_R, p_spike_ks, p_ROffsets, i);
            } else {
                device::var::fullLU_div_safe<PrecValueType><<<m_numPartitions - 1, threads>>>(d_R, p_spike_ks, p_ROffsets, i);
                device::var::fullLU_sub<PrecValueType><<<grids, threads>>>(d_R, p_spike_ks, p_ROffsets, i);
            }
        }
    } else {
        for (int i = m_k; i < two_k - 1; i++) {
            int  threads = two_k - 1 - i;
            dim3 grids(two_k - 1 - i, m_numPartitions - 1);

            if (threads > 1024) {
                device::var::fullLU_div_general<PrecValueType><<<m_numPartitions - 1, 512>>>(d_R, p_spike_ks, p_ROffsets, i);
                device::var::fullLU_sub_general<PrecValueType><<<grids, 512>>>(d_R, p_spike_ks, p_ROffsets, i);
            } else {
                device::var::fullLU_div<PrecValueType><<<m_numPartitions - 1, threads>>>(d_R, p_spike_ks, p_ROffsets, i);
                device::var::fullLU_sub<PrecValueType><<<grids, threads>>>(d_R, p_spike_ks, p_ROffsets, i);
            }
        }
    }

    {
        dim3 grids(m_k - 1, m_numPartitions - 1);
        if (m_k >= 1024)
            device::var::fullLU_post_divide_general<PrecValueType><<<grids, 512>>>(d_R, p_spike_ks, p_ROffsets);
        else
            device::var::fullLU_post_divide<PrecValueType><<<grids, m_k - 1>>>(d_R, p_spike_ks, p_ROffsets);
    }
}

// partBandedLU() -- LU factorization of the banded matrix, dispatched by
// bandwidth method and number of partitions.
template <typename PrecVector>
void
Precond<PrecVector>::partBandedLU()
{
    if (m_variableBandwidth) {
        // Variable-bandwidth method. Note that in this situation, there
        // must be more than one partition.
        partBandedLU_var();
    } else {
        // Constant-bandwidth method.
        if (m_numPartitions > 1)
            partBandedLU_const();
        else
            partBandedLU_one();
    }
}

template <typename PrecVector>
void
Precond<PrecVector>::partBandedLU_one()
{
    // As the name implies, this function can only be called if we are using
    // a single partition. In this case, the entire banded matrix m_B is LU
    // factorized.

    PrecValueType* dB = thrust::raw_pointer_cast(&m_B[0]);

    if (m_ks_col_host.size() != m_n)
        m_ks_col_host.resize(m_n, m_k);

    if (m_ks_row_host.size() != m_n)
        m_ks_row_host.resize(m_n, m_k);

    if (m_k >= CRITICAL_THRESHOLD) {
        int threadsNum = 0;

        for (int st_row = 0; st_row < m_n - 1; st_row++) {
            threadsNum = m_ks_col_host[st_row];
            // if (threadsNum > m_n - st_row - 1)
            //     threadsNum = m_n - st_row - 1;
            int blockX = m_ks_row_host[st_row];
            // if (blockX > m_n - st_row - 1)
            //     blockX = m_n - st_row - 1;
            if (threadsNum > 1024) {
                if (m_safeFactorization)
                    device::bandLU_critical_div_onePart_safe_general<PrecValueType><<<1, 512>>>(dB, st_row, m_k, threadsNum);
                else
                    device::bandLU_critical_div_onePart_general<PrecValueType><<<threadsNum / 512 + 1, 512>>>(dB, st_row, m_k, threadsNum);

                device::bandLU_critical_sub_onePart_general<PrecValueType><<<blockX, 512>>>(dB, st_row, m_k, threadsNum);
            } else {
                if (m_safeFactorization)
                    device::bandLU_critical_div_onePart_safe<PrecValueType><<<1, threadsNum>>>(dB, st_row, m_k);
                else
                    device::bandLU_critical_div_onePart<PrecValueType><<<1, threadsNum>>>(dB, st_row, m_k);

                device::bandLU_critical_sub_onePart<PrecValueType><<<blockX, threadsNum>>>(dB, st_row, m_k);
            }
        }
    } else if (m_k > 27) {
        if (m_safeFactorization)
            device::bandLU_g32_safe<PrecValueType><<<1, 512>>>(dB, m_k, m_n, 0);
        else
            device::bandLU_g32<PrecValueType><<<1, 512>>>(dB, m_k, m_n, 0);
    } else {
        if (m_safeFactorization)
            device::bandLU_safe<PrecValueType><<<1, m_k * m_k>>>(dB, m_k, m_n, 0);
        else
            device::bandLU<PrecValueType><<<1, m_k * m_k>>>(dB, m_k, m_n, 0);
    }


    if (m_safeFactorization)
        device::boostLastPivot<PrecValueType><<<1, 1>>>(dB, m_n, m_k, m_n, 0);


    // If not using safe factorization, check the factorized banded matrix for
    // any zeros on its diagonal (this means a zero pivot).
    if (!m_safeFactorization && hasZeroPivots(m_B.begin(), m_B.end(), m_k, (PrecValueType) BURST_VALUE))
        throw system_error(system_error::Zero_pivoting, "Found a pivot equal to zero (partBandedLU_one).");


    int gridX = m_n, gridY = 1;
    kernelConfigAdjust(gridX, gridY, MAX_GRID_DIMENSION);
    dim3 grids(gridX, gridY);
    if (m_k > 1024)
        device::bandLU_post_divide_general<PrecValueType><<<grids, 512>>>(dB, m_k, m_n);
    else
        device::bandLU_post_divide<PrecValueType><<<grids, m_k>>>(dB, m_k, m_n);
}

template <typename PrecVector>
void
Precond<PrecVector>::partBandedLU_const()
{
    // Note that this function is called only if there are two or more partitions.
    // Moreover, if the factorization method is LU_only, all diagonal blocks in
    // each partition are LU factorized. If the method is LU_UL, then the diagonal
    // block in the last partition is *not* factorized.

    PrecValueType* dB = thrust::raw_pointer_cast(&m_B[0]);

    int n_eff       = m_n;
    int numPart_eff = m_numPartitions;

    if (m_factMethod == LU_UL && m_numPartitions > 1 && m_precondType != Block) {
        n_eff -= m_n / m_numPartitions;
        numPart_eff--;
    }

    int partSize  = n_eff / numPart_eff;
    int remainder = n_eff % numPart_eff;

    if (m_k >= CRITICAL_THRESHOLD) {
        int final_partition_size = partSize + 1;
        int threadsNum = 0;

        for (int st_row = 0; st_row < final_partition_size - 1; st_row++) {
            if (st_row == 0) {
                if (remainder == 0) continue;
                threadsNum = m_k;
                if (threadsNum > final_partition_size - 1)
                    threadsNum = final_partition_size - 1;
                if (threadsNum > 1024) {
                    if (m_safeFactorization)
                        device::bandLU_critical_div_safe_general<PrecValueType><<<remainder, 512>>>(dB, st_row, m_k, partSize, remainder);
                    else
                        device::bandLU_critical_div_general<PrecValueType><<<remainder, 512>>>(dB, st_row, m_k, partSize, remainder);
                    dim3 tmpGrids(threadsNum, remainder);
                    device::bandLU_critical_sub_general<PrecValueType><<<tmpGrids, 512>>>(dB, st_row, m_k, partSize, remainder);
                } else {
                    if (m_safeFactorization)
                        device::bandLU_critical_div_safe<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                    else
                        device::bandLU_critical_div<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, m_k, partSize, remainder);

                    dim3 tmpGrids(threadsNum, remainder);
                    device::bandLU_critical_sub<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                }
            } else {
                threadsNum = m_k;
                if (threadsNum > final_partition_size - st_row - 1)
                    threadsNum = final_partition_size - st_row - 1;
                if (threadsNum > 1024) {
                    if (m_safeFactorization)
                        device::bandLU_critical_div_safe_general<PrecValueType><<<numPart_eff, 512>>>(dB, st_row, m_k, partSize, remainder);
                    else
                        device::bandLU_critical_div_general<PrecValueType><<<numPart_eff, 512>>>(dB, st_row, m_k, partSize, remainder);

                    dim3 tmpGrids(threadsNum, numPart_eff);
                    device::bandLU_critical_sub_general<PrecValueType><<<tmpGrids, 512>>>(dB, st_row, m_k, partSize, remainder);
                } else {
                    if (m_safeFactorization)
                        device::bandLU_critical_div_safe<PrecValueType><<<numPart_eff, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                    else
                        device::bandLU_critical_div<PrecValueType><<<numPart_eff, threadsNum>>>(dB, st_row, m_k, partSize, remainder);

                    dim3 tmpGrids(threadsNum, numPart_eff);
                    device::bandLU_critical_sub<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                }
            }
        }
    } else if (m_k > 27) {
        if (m_safeFactorization)
            device::bandLU_g32_safe<PrecValueType><<<numPart_eff, 512>>>(dB, m_k, partSize, remainder);
        else
            device::bandLU_g32<PrecValueType><<<numPart_eff, 512>>>(dB, m_k, partSize, remainder);
    } else {
        if (m_safeFactorization)
            device::bandLU_safe<PrecValueType><<<numPart_eff, m_k * m_k>>>(dB, m_k, partSize, remainder);
        else
            device::bandLU<PrecValueType><<<numPart_eff, m_k * m_k>>>(dB, m_k, partSize, remainder);
    }


    // If not using safe factorization, check the factorized banded matrix for
    // any zeros on its diagonal (this means a zero pivot). Note that we must
    // only check the diagonal blocks corresponding to the partitions for
    // which LU was applied.
    if (!m_safeFactorization && hasZeroPivots(m_B.begin(), m_B.begin() + n_eff * (2 * m_k + 1), m_k, (PrecValueType) BURST_VALUE))
        throw system_error(system_error::Zero_pivoting, "Found a pivot equal to zero (partBandedLU_const).");


    if (m_numPartitions == 1) {
        int gridX = m_n;
        int gridY = 1;
        kernelConfigAdjust(gridX, gridY, MAX_GRID_DIMENSION);
        dim3 grids(gridX, gridY);
        if (m_k > 1024)
            device::bandLU_post_divide_general<PrecValueType><<<grids, 512>>>(dB, m_k, m_n);
        else
            device::bandLU_post_divide<PrecValueType><<<grids, m_k>>>(dB, m_k, m_n);
    }
}


template <typename PrecVector>
void
Precond<PrecVector>::partBandedLU_var()
{
    // Note that this function can only be called if there are two or more
    // partitions. Also, in this case, the factorization method is LU_only,
    // which implies that all partitions are LU factorized.

    PrecValueType* dB         = thrust::raw_pointer_cast(&m_B[0]);
    int*           p_ks       = thrust::raw_pointer_cast(&m_ks[0]);
    int*           p_BOffsets = thrust::raw_pointer_cast(&m_BOffsets[0]);

    int tmp_k     = cusp::blas::nrmmax(m_ks);
    int partSize  = m_n / m_numPartitions;
    int remainder = m_n % m_numPartitions;

    if (tmp_k >= CRITICAL_THRESHOLD) {
        int final_partition_size = partSize + 1;
        int blockY = 0;
        int threadsNum = adjustNumThreads(cusp::blas::nrm1(m_ks) / m_numPartitions);
        int last = 0;

        for (int st_row = 0; st_row < final_partition_size - 1; st_row++) {
            if (st_row == 0) {
                if (remainder == 0) continue;

                blockY = m_ks_row_host[st_row];
                last   = m_ks_col_host[st_row];
                int corres_row = st_row;
                for (int i = 1; i < remainder; i++) {
                    corres_row += partSize + 1;
                    if (blockY < m_ks_row_host[corres_row])
                        blockY = m_ks_row_host[corres_row];
                    if (last < m_ks_col_host[corres_row])
                        last = m_ks_col_host[corres_row];
                }

                if (m_safeFactorization)
                    device::var::bandLU_critical_div_safe_general<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder);
                else
                    device::var::bandLU_critical_div_general<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder);

                dim3 tmpGrids(blockY, remainder);
                device::var::bandLU_critical_sub_general<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder, last);
            } else {
                blockY = 0;
                last   = 0;
                int corres_row = st_row;
                for (int i = 0; i < remainder; i++) {
                    if (blockY < m_ks_row_host[corres_row])
                        blockY = m_ks_row_host[corres_row];
                    if (last < m_ks_col_host[corres_row])
                        last = m_ks_col_host[corres_row];
                    corres_row += partSize + 1;
                }
                corres_row--;
                for (int i = remainder; i < m_numPartitions; i++) {
                    if (blockY < m_ks_row_host[corres_row])
                        blockY = m_ks_row_host[corres_row];
                    if (last < m_ks_col_host[corres_row])
                        last = m_ks_col_host[corres_row];
                    corres_row += partSize;
                }

                if (m_safeFactorization)
                    device::var::bandLU_critical_div_safe_general<PrecValueType><<<m_numPartitions, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder);
                else
                    device::var::bandLU_critical_div_general<PrecValueType><<<m_numPartitions, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder);

                dim3 tmpGrids(blockY, m_numPartitions);
                device::var::bandLU_critical_sub_general<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder, last);
            }
        }
    } else if (tmp_k > 27) {
        if (m_safeFactorization)
            device::var::bandLU_g32_safe<PrecValueType><<<m_numPartitions, 512>>>(dB, p_ks, p_BOffsets, partSize, remainder);
        else
            device::var::bandLU_g32<PrecValueType><<<m_numPartitions, 512>>>(dB, p_ks, p_BOffsets, partSize, remainder);
    } else {
        if (m_safeFactorization)
            device::var::bandLU_safe<PrecValueType><<<m_numPartitions, tmp_k * tmp_k>>>(dB, p_ks, p_BOffsets, partSize, remainder);
        else
            device::var::bandLU<PrecValueType><<<m_numPartitions, tmp_k * tmp_k>>>(dB, p_ks, p_BOffsets, partSize, remainder);
    }


    if (m_safeFactorization)
        device::var::boostLastPivot<PrecValueType><<<m_numPartitions, 1>>>(dB, partSize, p_ks, p_BOffsets, partSize, remainder);


    // If not using safe factorization, check for zero pivots in the factorized
    // banded matrix, one partition at a time.
    if (!m_safeFactorization) {
        for (int i = 0; i < m_numPartitions; i++) {
            if (hasZeroPivots(m_B.begin() + m_BOffsets_host[i], m_B.begin() + m_BOffsets_host[i + 1], m_ks_host[i], (PrecValueType) BURST_VALUE))
                throw system_error(system_error::Zero_pivoting, "Found a pivot equal to zero (partBandedLU_var).");
        }
    }


    int gridX = partSize + 1;
    int gridY = 1;
    kernelConfigAdjust(gridX, gridY, MAX_GRID_DIMENSION);
    dim3 grids(gridX, gridY);

    for (int i = 0; i < m_numPartitions; i++) {
        if (i < remainder) {
            if (m_ks_host[i] <= 1024)
                device::var::bandLU_post_divide_per_partition<PrecValueType><<<grids, m_ks_host[i]>>>(dB, m_ks_host[i], m_BOffsets_host[i], partSize + 1);
            else
                device::var::bandLU_post_divide_per_partition_general<PrecValueType><<<grids, 512>>>(dB, m_ks_host[i], m_BOffsets_host[i], partSize + 1);
        }
        else {
            if (m_ks_host[i] <= 1024)
                device::var::bandLU_post_divide_per_partition<PrecValueType><<<grids, m_ks_host[i]>>>(dB, m_ks_host[i], m_BOffsets_host[i], partSize);
            else
                device::var::bandLU_post_divide_per_partition_general<PrecValueType><<<grids, 512>>>(dB, m_ks_host[i], m_BOffsets_host[i], partSize);
        }
    }
}

// partBandedUL() -- UL factorization of the banded matrix copy B.
template <typename PrecVector>
void
Precond<PrecVector>::partBandedUL(PrecVector& B)
{
    // Note that this function can only be called if using the constant-bandwidth
    // method and there are two or more partitions. In any other situation, we use
    // LU-only factorization. This means that the diagonal block for the first
    // partition is never UL factorized.


    int partSize  = m_n / m_numPartitions;
    int remainder = m_n % m_numPartitions;
    int n_first   = (remainder == 0 ? partSize : (partSize + 1));

    PrecValueType* dB = thrust::raw_pointer_cast(&B[(2 * m_k + 1) * n_first]);

    int n_eff       = m_n - n_first;
    int numPart_eff = m_numPartitions - 1;

    partSize  = n_eff / numPart_eff;
    remainder = n_eff % numPart_eff;

    if (m_k >= CRITICAL_THRESHOLD) {
        int n_final = partSize + 1;
        int threadsNum = 0;
        for (int st_row = n_final - 1; st_row > 0; st_row--) {
            if (st_row == n_final - 1) {
                if (remainder == 0) continue;
                threadsNum = m_k;
                if (st_row < m_k)
                    threadsNum = st_row;
                dim3 tmpGrids(threadsNum, remainder);
                if (threadsNum > 1024) {
                    if (m_safeFactorization)
                        device::bandUL_critical_div_safe_general<PrecValueType><<<remainder, 512>>>(dB, st_row, m_k, partSize, remainder);
                    else
                        device::bandUL_critical_div_general<PrecValueType><<<remainder, 512>>>(dB, st_row, m_k, partSize, remainder);
                    device::bandUL_critical_sub_general<PrecValueType><<<tmpGrids, 512>>>(dB, st_row, m_k, partSize, remainder);
                } else {
                    if (m_safeFactorization)
                        device::bandUL_critical_div_safe<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                    else
                        device::bandUL_critical_div<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                    device::bandUL_critical_sub<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                }
            } else {
                threadsNum = m_k;
                if (st_row < m_k)
                    threadsNum = st_row;
                dim3 tmpGrids(threadsNum, numPart_eff);
                if (threadsNum > 1024) {
                    if (m_safeFactorization)
                        device::bandUL_critical_div_safe_general<PrecValueType><<<numPart_eff, 512>>>(dB, st_row, m_k, partSize, remainder);
                    else
                        device::bandUL_critical_div_general<PrecValueType><<<numPart_eff, 512>>>(dB, st_row, m_k, partSize, remainder);
                    device::bandUL_critical_sub_general<PrecValueType><<<tmpGrids, 512>>>(dB, st_row, m_k, partSize, remainder);
                } else {
                    if (m_safeFactorization)
                        device::bandUL_critical_div_safe<PrecValueType><<<numPart_eff, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                    else
                        device::bandUL_critical_div<PrecValueType><<<numPart_eff, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                    device::bandUL_critical_sub<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
                }
            }
        }
    } else if (m_k > 27) {
        if (m_safeFactorization)
            device::bandUL_g32_safe<PrecValueType><<<numPart_eff, 512>>>(dB, m_k, partSize, remainder);
        else
            device::bandUL_g32<PrecValueType><<<numPart_eff, 512>>>(dB, m_k, partSize, remainder);
    } else {
        if (m_safeFactorization)
            device::bandUL_safe<PrecValueType><<<numPart_eff, m_k * m_k>>>(dB, m_k, partSize, remainder);
        else
            device::bandUL<PrecValueType><<<numPart_eff, m_k * m_k>>>(dB, m_k, partSize, remainder);
    }


    // If not using safe factorization, check for zero pivots in the factorized
    // banded matrix.
    if (!m_safeFactorization && hasZeroPivots(B.begin() + (2 * m_k + 1) * n_first, B.end(), m_k, (PrecValueType) BURST_VALUE))
        throw system_error(system_error::Zero_pivoting, "Found a pivot equal to zero (partBandedUL).");
}


// partBandedFwdSweep() -- forward elimination sweep with the banded L factor.
template <typename PrecVector>
void
Precond<PrecVector>::partBandedFwdSweep(PrecVector& v)
{
    if (!m_variableBandwidth)
        partBandedFwdSweep_const(v);
    else
        partBandedFwdSweep_var(v);
}

template <typename PrecVector>
void
Precond<PrecVector>::partBandedFwdSweep_const(PrecVector& v)
{
    PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
    PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);

    int partSize  = m_n / m_numPartitions;
    int remainder = m_n % m_numPartitions;

    if (m_precondType == Block || m_factMethod == LU_only || m_numPartitions == 1) {
        if (m_k > 1024)
            device::forwardElimL_general<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
        else if (m_k > 32)
            device::forwardElimL_g32<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
        else
            device::forwardElimL<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
    } else {
        if (m_k > 1024)
            device::forwardElimL_LU_UL_general<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
        else if (m_k > 32)
            device::forwardElimL_LU_UL_g32<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
        else
            device::forwardElimL_LU_UL<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
    }
}

template <typename PrecVector>
void
Precond<PrecVector>::partBandedFwdSweep_var(PrecVector& v)
{
    PrecValueType* p_B        = thrust::raw_pointer_cast(&m_B[0]);
    PrecValueType* p_v        = thrust::raw_pointer_cast(&v[0]);
    int*           p_ks       = thrust::raw_pointer_cast(&m_ks[0]);
    int*           p_BOffsets = thrust::raw_pointer_cast(&m_BOffsets[0]);

    int tmp_k     = cusp::blas::nrmmax(m_ks);
    int partSize  = m_n / m_numPartitions;
    int remainder = m_n % m_numPartitions;

    if (tmp_k > 1024)
        device::var::fwdElim_sol<PrecValueType><<<m_numPartitions, 512>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
    else if (tmp_k > 32)
        device::var::fwdElim_sol_medium<PrecValueType><<<m_numPartitions, tmp_k>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
    else
        device::var::fwdElim_sol_narrow<PrecValueType><<<m_numPartitions, tmp_k>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
}

// partBandedBckSweep() -- backward substitution sweep with the banded U factor.
template <typename PrecVector>
void
Precond<PrecVector>::partBandedBckSweep(PrecVector& v)
{
    if (!m_variableBandwidth)
        partBandedBckSweep_const(v);
    else
        partBandedBckSweep_var(v);
}
1755 
1756 template <typename PrecVector>
1757 void
1758 Precond<PrecVector>::partBandedBckSweep_const(PrecVector& v)
1759 {
1760  PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1761  PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1762 
1763  int partSize = m_n / m_numPartitions;
1764  int remainder = m_n % m_numPartitions;
1765 
1766  if (m_precondType == Block || m_factMethod == LU_only || m_numPartitions == 1) {
1767  if (m_numPartitions > 1) {
1768  if (m_k > 1024)
1769  device::backwardElimU_general<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1770  else if (m_k > 32)
1771  device::backwardElimU_g32<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1772  else
1773  device::backwardElimU<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1774  } else {
1775  int gridX = 1;
1776  int blockX = m_n;
1777  if (blockX > BLOCK_SIZE) {
1778  gridX = (blockX + BLOCK_SIZE - 1) / BLOCK_SIZE;
1779  blockX = BLOCK_SIZE;
1780  }
1781  dim3 grids(gridX, m_numPartitions);
1782 
1783  device::preBck_sol_divide<PrecValueType><<<grids, blockX>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1784 
1785  if (m_k > 1024)
1786  device::bckElim_sol<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1787  else if (m_k > 32)
1788  device::bckElim_sol_medium<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1789  else
1790  device::bckElim_sol_narrow<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1791  }
1792  } else {
1793  if (m_k > 1024)
1794  device::backwardElimU_LU_UL_general<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1795  else if (m_k > 32)
1796  device::backwardElimU_LU_UL_g32<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1797  else
1798  device::backwardElimU_LU_UL<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1799  }
1800 }
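
// Companion sketch for the backward solve above: the kernels split the work
// into a pivot-division pass (preBck_sol_divide) and an elimination pass
// (bckElim_sol*); written sequentially, solving U*x = v over the same banded
// layout as in forwardElimL_reference looks like this.
template <typename T>
void backwardElimU_reference(int n, int k, const T* B, T* v)
{
	for (int i = n - 1; i >= 0; i--) {
		v[i] /= B[i * (2 * k + 1) + k];                // divide by pivot U(i,i)
		for (int r = i - 1; r >= i - k && r >= 0; r--)
			v[r] -= B[r * (2 * k + 1) + k + (i - r)] * v[i];   // v[r] -= U(r,i) * x[i]
	}
}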
1801 
1802 template <typename PrecVector>
1803 void
1804 Precond<PrecVector>::partBandedBckSweep_var(PrecVector& v)
1805 {
1806  PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1807  PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1808  int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
1809  int* p_BOffsets = thrust::raw_pointer_cast(&m_BOffsets[0]);
1810 
1811  int tmp_k = cusp::blas::nrmmax(m_ks);
1812  int partSize = m_n / m_numPartitions;
1813  int remainder = m_n % m_numPartitions;
1814 
1815  int gridX = 1, blockX = partSize + 1;
1816  kernelConfigAdjust(blockX, gridX, BLOCK_SIZE);
1817  dim3 grids(gridX, m_numPartitions);
1818  device::var::preBck_sol_divide<PrecValueType><<<grids, blockX>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1819 
1820  if (tmp_k > 1024)
1821  device::var::bckElim_sol<PrecValueType><<<m_numPartitions, 512>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1822  else if (tmp_k > 32)
1823  device::var::bckElim_sol_medium<PrecValueType><<<m_numPartitions, tmp_k>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1824  else
1825  device::var::bckElim_sol_narrow<PrecValueType><<<m_numPartitions, tmp_k>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1826 }
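
// kernelConfigAdjust (declared in spike/common.h) keeps launch shapes legal
// throughout these solves; its effect -- a sketch matching the open-coded
// version in partBandedBckSweep_const above -- is:
inline void kernelConfigAdjust_sketch(int& numThreads, int& numBlockX, const int numThreadsMax)
{
	if (numThreads > numThreadsMax) {
		numBlockX  = (numThreads + numThreadsMax - 1) / numThreadsMax;  // enough blocks to cover all threads
		numThreads = numThreadsMax;                                     // clamp the block size
	}
}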
1827 
1832 template <typename PrecVector>
1833 void
1834 Precond<PrecVector>::partFullFwdSweep(PrecVector& v)
1835 {
1836  PrecValueType* p_R = thrust::raw_pointer_cast(&m_R[0]);
1837  PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1838 
1839  int partSize = m_n / m_numPartitions;
1840  int remainder = m_n % m_numPartitions;
1841 
1842  dim3 grids(m_numPartitions-1, 1);
1843 
1844  if (!m_variableBandwidth) {
1845  if (m_k > 512)
1846  device::forwardElimLNormal_g512<PrecValueType><<<grids, 512>>>(m_n, m_k, 2*m_k, p_R, p_v, partSize, remainder);
1847  else
1848  device::forwardElimLNormal<PrecValueType><<<grids, 2*m_k-1>>>(m_n, m_k, 2*m_k, p_R, p_v, partSize, remainder);
1849  } else {
1850  int* p_ROffsets = thrust::raw_pointer_cast(&m_ROffsets[0]);
1851  int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
1852 
1853  if (m_k > 512)
1854  device::var::fwdElim_full<PrecValueType><<<grids, 512>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1855  else
1856  device::var::fwdElim_full_narrow<PrecValueType><<<grids, m_k>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1857  }
1858 }
1859 
1860 
1865 template <typename PrecVector>
1866 void
1867 Precond<PrecVector>::partFullBckSweep(PrecVector& v)
1868 {
1869  PrecValueType* p_R = thrust::raw_pointer_cast(&m_R[0]);
1870  PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1871 
1872  int partSize = m_n / m_numPartitions;
1873  int remainder = m_n % m_numPartitions;
1874 
1875  dim3 grids(m_numPartitions-1, 1);
1876 
1877  if (!m_variableBandwidth) {
1878  if (m_k > 512)
1879  device::backwardElimUNormal_g512<PrecValueType><<<grids, 512>>>(m_n, m_k, 2*m_k, p_R, p_v, partSize, remainder);
1880  else
1881  device::backwardElimUNormal<PrecValueType><<<grids, 2*m_k-1>>>(m_n, m_k, 2*m_k, p_R, p_v, partSize, remainder);
1882  } else {
1883  int* p_ROffsets = thrust::raw_pointer_cast(&m_ROffsets[0]);
1884  int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
1885 
1886  if (m_k > 512) {
1887  device::var::preBck_full_divide<PrecValueType><<<m_numPartitions-1, 512>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1888  device::var::bckElim_full<PrecValueType><<<grids, 512>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1889  }
1890  else {
1891  device::var::preBck_full_divide_narrow<PrecValueType><<<m_numPartitions-1, m_k>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1892  device::var::bckElim_full_narrow<PrecValueType><<<grids, 2*m_k-1>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1893  }
1894  }
1895 }
1896 
1902 template <typename PrecVector>
1903 void
1904 Precond<PrecVector>::purifyRHS(PrecVector& v,
1905  PrecVector& res)
1906 {
1907  PrecValueType* p_offDiags = thrust::raw_pointer_cast(&m_offDiags[0]);
1908  PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1909  PrecValueType* p_res = thrust::raw_pointer_cast(&res[0]);
1910 
1911  int partSize = m_n / m_numPartitions;
1912  int remainder = m_n % m_numPartitions;
1913 
1914  dim3 grids(m_k, m_numPartitions-1);
1915 
1916  if (!m_variableBandwidth) {
1917  if (m_k > 256)
1918  device::innerProductBCX_g256<PrecValueType><<<grids, 256>>>(p_offDiags, p_v, p_res, m_n, m_k, partSize, m_numPartitions, remainder);
1919  else if (m_k > 64)
1920  device::innerProductBCX_g64<PrecValueType><<<grids, 256>>>(p_offDiags, p_v, p_res, m_n, m_k, partSize, m_numPartitions, remainder);
1921  else if (m_k > 32)
1922  device::innerProductBCX_g32<PrecValueType><<<grids, 64>>>(p_offDiags, p_v, p_res, m_n, m_k, partSize, m_numPartitions, remainder);
1923  else
1924  device::innerProductBCX<PrecValueType><<<grids, 32>>>(p_offDiags, p_v, p_res, m_n, m_k, partSize, m_numPartitions, remainder);
1925  } else {
1926  int* p_WVOffsets = thrust::raw_pointer_cast(&m_WVOffsets[0]);
1927  int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
1928 
1929  if (m_k > 256)
1930  device::innerProductBCX_var_bandwidth_g256<PrecValueType><<<grids, 256>>>(p_offDiags, p_v, p_res, m_n, p_spike_ks, p_WVOffsets, partSize, m_numPartitions, remainder);
1931  else if (m_k > 64)
1932  device::innerProductBCX_var_bandwidth_g64<PrecValueType><<<grids, 256>>>(p_offDiags, p_v, p_res, m_n, p_spike_ks, p_WVOffsets, partSize, m_numPartitions, remainder);
1933  else if (m_k > 32)
1934  device::innerProductBCX_var_bandwidth_g32<PrecValueType><<<grids, 64>>>(p_offDiags, p_v, p_res, m_n, p_spike_ks, p_WVOffsets, partSize, m_numPartitions, remainder);
1935  else
1936  device::innerProductBCX_var_bandwidth<PrecValueType><<<grids, 32>>>(p_offDiags, p_v, p_res, m_n, p_spike_ks, p_WVOffsets, partSize, m_numPartitions, remainder);
1937  }
1938 }
1939 
1945 template <typename PrecVector>
1946 void
1947 Precond<PrecVector>::calculateSpikes(PrecVector& WV)
1948 {
1949  if (!m_variableBandwidth) {
1950  calculateSpikes_const(WV);
1951  return;
1952  }
1953 
1954  int totalRHSCount = cusp::blas::nrm1(m_offDiagWidths_right_host) + cusp::blas::nrm1(m_offDiagWidths_left_host);
1955  if (totalRHSCount >= 2800) {
1956  calculateSpikes_var(WV);
1957  return;
1958  }
1959 
1960  calculateSpikes_var_old(WV);
1961 }
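
// Dispatch note: cusp::blas::nrm1 sums absolute values, so totalRHSCount
// above is the combined number of left and right spike columns across all
// partitions. The cutoff of 2800 appears to be an empirical tuning constant:
// only with enough right-hand sides to keep the GPU busy does the batched
// calculateSpikes_var path pay off over calculateSpikes_var_old.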
1962 
1963 template <typename PrecVector>
1964 void
1965 Precond<PrecVector>::calculateSpikes_var_old(PrecVector& WV)
1966 {
1967  PrecVector WV_spare(m_k*m_k);
1968 
1969  PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
1970  PrecValueType* p_WV_spare = thrust::raw_pointer_cast(&WV_spare[0]);
1971 
1972  // Calculate the size of the first and last partitions.
1973  int last_partition_size = m_n / m_numPartitions;
1974  int first_partition_size = last_partition_size;
1975  int numThreadsToUse = adjustNumThreads(cusp::blas::nrm1(m_ks) / m_numPartitions);
1976 
1977  if (m_n % m_numPartitions != 0)
1978  first_partition_size++;
1979 
1980 
1981  // Copy WV into extV, perform sweeps to calculate extV, then copy back extV to WV.
1982  // Note that we skip the last partition (no right spike associated with it). Also
 1983  // note that we compute truncated spikes only, using the bottom parts of the L and
 1984  // U factors to obtain just the bottom block of the right spikes V.
1985  {
1986  int n_eff = m_n - last_partition_size;
1987  int numPart_eff = m_numPartitions - 1;
1988  int partSize = n_eff / numPart_eff;
1989  int remainder = n_eff % numPart_eff;
1990 
1991  const int BUF_FACTOR = 16;
1992 
1993  PrecVector extV(m_k * n_eff, (PrecValueType)0), buffer;
1994 
1995  PrecValueType* p_extV = thrust::raw_pointer_cast(&extV[0]);
1996  PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1997  int* p_secondReordering = thrust::raw_pointer_cast(&m_secondReordering[0]);
1998  int* p_secondPerm = thrust::raw_pointer_cast(&m_secondPerm[0]);
1999 
2000  dim3 gridsCopy(m_k, numPart_eff);
2001  dim3 gridsSweep(numPart_eff, m_k);
2002 
2003  int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
2004  int* p_offDiagWidths_right = thrust::raw_pointer_cast(&m_offDiagWidths_right[0]);
2005  int* p_offDiagPerms_right = thrust::raw_pointer_cast(&m_offDiagPerms_right[0]);
2006  int* p_first_rows = thrust::raw_pointer_cast(&m_first_rows[0]);
2007  int* p_offsets = thrust::raw_pointer_cast(&m_BOffsets[0]);
2008 
2009  int permuteBlockX = n_eff;
2010  int permuteGridX = 1;
2011  kernelConfigAdjust(permuteBlockX, permuteGridX, BLOCK_SIZE);
2012  dim3 gridsPermute(permuteGridX, m_k);
2013 
2014  {
2015  device::copyWVFromOrToExtendedV_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV, false);
2016  buffer.resize((m_k - (BUF_FACTOR-1) * (m_k / BUF_FACTOR)) * n_eff);
2017 
2018  PrecValueType* p_buffer = thrust::raw_pointer_cast(&buffer[0]);
2019 
2020  for (int i=0; i<BUF_FACTOR; i++) {
2021  gridsPermute.y = m_k / BUF_FACTOR;
2022  if (i == BUF_FACTOR - 1)
2023  gridsPermute.y = m_k - (BUF_FACTOR-1) * (m_k/BUF_FACTOR);
2024  device::permute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, p_extV+(i*(m_k/BUF_FACTOR)*n_eff), p_buffer, p_secondPerm);
2025  thrust::copy(buffer.begin(), buffer.begin()+(gridsPermute.y * n_eff), extV.begin()+i*(m_k/BUF_FACTOR)*n_eff);
2026  }
2027 
2028  {
2029  int last_row = 0, pseudo_first_row = 0;
2030  for (int i=0; i<numPart_eff; i++) {
2031  if (i < remainder)
2032  last_row += partSize + 1;
2033  else
2034  last_row += partSize;
2035 
2036  int tmp_first_row = m_first_rows_host[i];
2037  device::var::fwdElim_rightSpike_per_partition<PrecValueType><<<m_offDiagWidths_right_host[i], numThreadsToUse>>> (n_eff, m_ks_host[i], m_BOffsets_host[i]+m_ks_host[i]+(2*m_ks_host[i]+1)*(m_first_rows_host[i]-pseudo_first_row), p_B, p_extV, m_first_rows_host[i], last_row);
2038 
2039  int blockX = last_row - m_first_rows_host[i];
2040  int gridX = 1;
2041  kernelConfigAdjust(blockX, gridX, BLOCK_SIZE);
2042  dim3 grids(gridX, m_offDiagWidths_right_host[i]);
2043  device::var::preBck_rightSpike_divide_per_partition<PrecValueType><<<grids, blockX>>> (n_eff, m_ks_host[i], m_BOffsets_host[i]+m_ks_host[i]+(2*m_ks_host[i]+1)*(m_first_rows_host[i]-pseudo_first_row), p_B, p_extV, m_first_rows_host[i], last_row);
2044 
2045  m_first_rows_host[i] = thrust::reduce(m_secondPerm.begin()+(last_row-m_k), m_secondPerm.begin()+last_row, last_row, thrust::minimum<int>());
2046  device::var::bckElim_rightSpike_per_partition<PrecValueType><<<m_offDiagWidths_right_host[i], numThreadsToUse>>> (n_eff, m_ks_host[i], m_BOffsets_host[i]+m_ks_host[i]+(2*m_ks_host[i]+1)*(last_row-pseudo_first_row-1), p_B, p_extV, m_first_rows_host[i], last_row);
2047 
2048  pseudo_first_row = last_row;
2049  }
2050  }
2051 
2052  for (int i=0; i<BUF_FACTOR; i++) {
2053  gridsPermute.y = m_k / BUF_FACTOR;
2054  if (i == BUF_FACTOR - 1)
2055  gridsPermute.y = m_k - (BUF_FACTOR-1) * (m_k/BUF_FACTOR);
2056  device::permute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, p_extV+(i*(m_k/BUF_FACTOR)*n_eff), p_buffer, p_secondReordering);
2057  thrust::copy(buffer.begin(), buffer.begin()+(gridsPermute.y * n_eff), extV.begin()+i*(m_k/BUF_FACTOR)*n_eff);
2058  }
2059 
2060  device::copyWVFromOrToExtendedV_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV, true);
2061  }
2062  for (int i=0; i<numPart_eff; i++) {
2063  cusp::blas::fill(WV_spare, (PrecValueType) 0);
2064  device::matrixVReordering_perPartition<PrecValueType><<<m_offDiagWidths_right_host[i], numThreadsToUse>>>(m_k, p_WV+2*i*m_k*m_k, p_WV_spare, p_offDiagPerms_right+i*m_k);
2065  thrust::copy(WV_spare.begin(), WV_spare.end(), WV.begin() + (2*i*m_k*m_k));
2066  }
2067  }
2068 
2069 
2070  // Copy WV into extW, perform sweeps to calculate extW, then copy back extW to WV.
2071  // Note that we skip the first partition (no left spike associated with it). Also
2072  // note that we perform full sweeps using the L and U factors to calculate the
2073  // entire left spikes W.
2074  {
2075  int n_eff = m_n - first_partition_size;
2076  int numPart_eff = m_numPartitions - 1;
2077  int partSize = n_eff / numPart_eff;
2078  int remainder = n_eff % numPart_eff;
2079 
2080  const int BUF_FACTOR = 16;
2081 
2082  PrecVector extW(m_k * n_eff, (PrecValueType)0), buffer;
2083 
2084  PrecValueType* p_extW = thrust::raw_pointer_cast(&extW[0]);
2085  PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
2086 
2087  dim3 gridsSweep(numPart_eff, m_k);
2088  dim3 gridsCopy(m_k, numPart_eff);
2089 
2090  int* p_ks = thrust::raw_pointer_cast(&m_ks[1]);
2091  int* p_offDiagWidths_left = thrust::raw_pointer_cast(&m_offDiagWidths_left[0]);
2092  int* p_offDiagPerms_left = thrust::raw_pointer_cast(&m_offDiagPerms_left[0]);
2093 
2094  IntVector tmp_offsets;
2095  IntVector tmp_secondReordering(m_n, first_partition_size);
2096  IntVector tmp_secondPerm(m_n, first_partition_size);
2097 
2098  cusp::blas::axpby(m_secondReordering, tmp_secondReordering, tmp_secondReordering, 1.0, -1.0);
2099  cusp::blas::axpby(m_secondPerm, tmp_secondPerm, tmp_secondPerm, 1.0, -1.0);
2100 
2101  int* p_secondReordering = thrust::raw_pointer_cast(&tmp_secondReordering[first_partition_size]);
2102  int* p_secondPerm = thrust::raw_pointer_cast(&tmp_secondPerm[first_partition_size]);
2103 
2104  {
2105  IntVectorH tmp_offsets_host = m_BOffsets;
2106  for (int i = m_numPartitions-1; i >= 1; i--)
2107  tmp_offsets_host[i] -= tmp_offsets_host[1];
2108  tmp_offsets = tmp_offsets_host;
2109  }
2110 
2111  int* p_offsets = thrust::raw_pointer_cast(&tmp_offsets[1]);
2112 
2113  int permuteBlockX = n_eff;
2114  int permuteGridX = 1;
2115  kernelConfigAdjust(permuteBlockX, permuteGridX, BLOCK_SIZE);
2116  dim3 gridsPermute(permuteGridX, m_k);
2117 
2118  {
2119  device::copyWVFromOrToExtendedW_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW, false);
2120  buffer.resize((m_k - (BUF_FACTOR-1) * (m_k / BUF_FACTOR)) * n_eff);
2121  PrecValueType* p_buffer = thrust::raw_pointer_cast(&buffer[0]);
2122 
2123  for (int i = 0; i < BUF_FACTOR; i++) {
2124  gridsPermute.y = m_k / BUF_FACTOR;
2125  if (i == BUF_FACTOR - 1)
2126  gridsPermute.y = m_k - (BUF_FACTOR-1) * (m_k/BUF_FACTOR);
2127  device::permute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, p_extW+i*(m_k/BUF_FACTOR)*n_eff, p_buffer, p_secondPerm);
2128  thrust::copy(buffer.begin(), buffer.begin()+(gridsPermute.y * n_eff), extW.begin()+i*(m_k/BUF_FACTOR)*n_eff);
2129  }
2130 
2131  {
2132  int last_row = 0;
2133  int first_row = 0;
2134  for (int i = 0; i < numPart_eff; i++) {
2135  if (i < remainder)
2136  last_row += partSize + 1;
2137  else
2138  last_row += partSize;
2139  device::var::fwdElim_leftSpike_per_partition<PrecValueType><<<m_offDiagWidths_left_host[i], numThreadsToUse>>> (n_eff, m_ks_host[i+1], m_k - m_offDiagWidths_left_host[i], m_BOffsets_host[i+1]+m_ks_host[i+1], p_B, p_extW, first_row, last_row);
2140 
2141  int blockX = last_row - first_row;
2142  int gridX = 1;
2143  kernelConfigAdjust(blockX, gridX, BLOCK_SIZE);
2144  dim3 grids(gridX, m_offDiagWidths_left_host[i]);
2145 
2146  device::var::preBck_leftSpike_divide_per_partition<PrecValueType><<<grids, blockX>>> (n_eff, m_ks_host[i+1], m_k - m_offDiagWidths_left_host[i], m_BOffsets_host[i+1]+m_ks_host[i+1], p_B, p_extW, first_row, last_row);
2147  device::var::bckElim_leftSpike_per_partition<PrecValueType><<<m_offDiagWidths_left_host[i], numThreadsToUse>>>(n_eff, m_ks_host[i+1], m_k - m_offDiagWidths_left_host[i], m_BOffsets_host[i+1] + m_ks_host[i+1] + (2*m_ks_host[i+1]+1)*(last_row-first_row-1), p_B, p_extW, first_row, last_row);
2148  first_row = last_row;
2149  }
2150  }
2151 
2152  for (int i = 0; i < BUF_FACTOR; i++) {
2153  gridsPermute.y = m_k / BUF_FACTOR;
2154  if (i == BUF_FACTOR - 1)
2155  gridsPermute.y = m_k - (BUF_FACTOR-1) * (m_k/BUF_FACTOR);
2156 
2157  device::permute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, p_extW+i*(m_k/BUF_FACTOR)*n_eff, p_buffer, p_secondReordering);
2158  thrust::copy(buffer.begin(), buffer.begin()+(gridsPermute.y * n_eff), extW.begin()+i*(m_k/BUF_FACTOR)*n_eff);
2159  }
2160 
2161  device::copyWVFromOrToExtendedW_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW, true);
2162  }
2163 
2164  for (int i = 0; i < numPart_eff; i++) {
2165  cusp::blas::fill(WV_spare, (PrecValueType) 0);
2166  device::matrixWReordering_perPartition<PrecValueType><<<m_offDiagWidths_left_host[i], numThreadsToUse>>>(m_k, p_WV+(2*i+1)*m_k*m_k, p_WV_spare, p_offDiagPerms_left+i*m_k);
2167  thrust::copy(WV_spare.begin(), WV_spare.end(), WV.begin() + ((2*i+1)*m_k*m_k));
2168  }
2169  }
2170 }
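
// A note on the BUF_FACTOR loops above: the k spike columns are permuted
// through the scratch 'buffer' in 16 chunks. The buffer is sized for the
// largest chunk (the final m_k - 15*(m_k/16) columns), so scratch storage
// stays near 1/16th of a full k x n_eff copy, at the cost of 16 launches.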
2171 
2172 template <typename PrecVector>
2173 void
2174 Precond<PrecVector>::calculateSpikes_const(PrecVector& WV)
2175 {
2176  PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
2177 
2178 
2179  // Calculate the size of the first and last partitions.
2180  int last_partition_size = m_n / m_numPartitions;
2181  int first_partition_size = last_partition_size;
2182 
2183  if (m_n % m_numPartitions != 0)
2184  first_partition_size++;
2185 
2186 
2187  // Copy WV into extV, perform sweeps to calculate extV, then copy back extV to WV.
2188  // Note that we skip the last partition (no right spike associated with it). Also
 2189  // note that we compute truncated spikes only, using the bottom parts of the L and
 2190  // U factors to obtain just the bottom block of the right spikes V.
2191  {
2192  int n_eff = m_n - last_partition_size;
2193  int numPart_eff = m_numPartitions - 1;
2194  int partSize = n_eff / numPart_eff;
2195  int remainder = n_eff % numPart_eff;
2196 
2197  PrecVector extV(m_k * n_eff, (PrecValueType) 0);
2198 
2199  PrecValueType* p_extV = thrust::raw_pointer_cast(&extV[0]);
2200  PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
2201 
2202  dim3 gridsCopy(m_k, numPart_eff);
2203  dim3 gridsSweep(numPart_eff, m_k);
2204 
2205  if (m_k > 1024) {
2206  device::copyWVFromOrToExtendedV_general<PrecValueType><<<gridsCopy, 512>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV, false);
2207  device::forwardElimL_bottom_general<PrecValueType><<<gridsSweep, 512>>>(n_eff, m_k, m_k, p_B, p_extV, partSize, remainder);
2208  device::backwardElimU_bottom_general<PrecValueType><<<gridsSweep, 512>>>(n_eff, m_k, 2*m_k, p_B, p_extV, partSize, remainder);
2209  device::copyWVFromOrToExtendedV_general<PrecValueType><<<gridsCopy, 512>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV, true);
2210  } else if (m_k > 32) {
2211  device::copyWVFromOrToExtendedV<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV, false);
2212  device::forwardElimL_bottom_g32<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, m_k, p_B, p_extV, partSize, remainder);
2213  device::backwardElimU_bottom_g32<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, 2*m_k, p_B, p_extV, partSize, remainder);
2214  device::copyWVFromOrToExtendedV<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV, true);
2215  } else {
2216  device::copyWVFromOrToExtendedV<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV, false);
2217  device::forwardElimL_bottom<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, m_k, p_B, p_extV, partSize, remainder);
2218  device::backwardElimU_bottom<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, 2*m_k, p_B, p_extV, partSize, remainder);
2219  device::copyWVFromOrToExtendedV<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV, true);
2220  }
2221  }
2222 
2223 
2224  // Copy WV into extW, perform sweeps to calculate extW, then copy back extW to WV.
2225  // Note that we skip the first partition (no left spike associated with it). Also
2226  // note that we perform full sweeps using the L and U factors to calculate the
2227  // entire left spikes W.
2228  {
2229  int n_eff = m_n - first_partition_size;
2230  int numPart_eff = m_numPartitions - 1;
2231  int partSize = n_eff / numPart_eff;
2232  int remainder = n_eff % numPart_eff;
2233 
2234  PrecVector extW(m_k * n_eff, (PrecValueType) 0);
2235 
2236  PrecValueType* p_extW = thrust::raw_pointer_cast(&extW[0]);
2237  PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[(2*m_k+1)*first_partition_size]);
2238 
2239  dim3 gridsSweep(numPart_eff, m_k);
2240  dim3 gridsCopy(m_k, numPart_eff);
2241 
2242  if (m_k > 1024) {
2243  device::copyWVFromOrToExtendedW_general<PrecValueType><<<gridsCopy, 512>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW, false);
2244  device::forwardElimL_general<PrecValueType><<<gridsSweep, 512>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2245  device::backwardElimU_general<PrecValueType><<<gridsSweep, 512>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2246  device::copyWVFromOrToExtendedW_general<PrecValueType><<<gridsCopy, 512>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW, true);
2247  } else if (m_k > 32) {
2248  device::copyWVFromOrToExtendedW<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW, false);
2249  device::forwardElimL_g32<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2250  device::backwardElimU_g32<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2251  device::copyWVFromOrToExtendedW<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW, true);
2252  } else {
2253  device::copyWVFromOrToExtendedW<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW, false);
2254  device::forwardElimL<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2255  device::backwardElimU<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2256  device::copyWVFromOrToExtendedW<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW, true);
2257  }
2258  }
2259 }
2260 
2261 template <typename PrecVector>
2262 void
2263 Precond<PrecVector>::calculateSpikes_var(PrecVector& WV)
2264 {
2265  PrecVector WV_spare(m_k*m_k);
2266 
2267  PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
2268  PrecValueType* p_WV_spare = thrust::raw_pointer_cast(&WV_spare[0]);
2269 
 2270  // Pick the number of sweep threads from the average bandwidth per partition.
2271  int numThreadsToUse = adjustNumThreads(cusp::blas::nrm1(m_ks) / m_numPartitions);
2272 
2273  const int SWEEP_MAX_NUM_THREADS = 128;
2274  // Copy WV into extV, perform sweeps to calculate extV, then copy back extV to WV.
2275  // Note that we skip the last partition (no right spike associated with it). Also
 2276  // note that we compute truncated spikes only, using the bottom parts of the L and
 2277  // U factors to obtain just the bottom block of the right spikes V.
2278  {
2279  int n_eff = m_n;
2280  int numPart_eff = m_numPartitions;
2281  int partSize = n_eff / numPart_eff;
2282  int remainder = n_eff % numPart_eff;
2283  int rightOffDiagWidth = cusp::blas::nrmmax(m_offDiagWidths_right);
2284  int leftOffDiagWidth = cusp::blas::nrmmax(m_offDiagWidths_left);
2285 
2286  PrecVector extWV((leftOffDiagWidth + rightOffDiagWidth) * n_eff, (PrecValueType) 0);
2287  PrecVector buffer;
2288 
2289  PrecValueType* p_extWV = thrust::raw_pointer_cast(&extWV[0]);
2290  PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
2291  int* p_secondReordering = thrust::raw_pointer_cast(&m_secondReordering[0]);
2292  int* p_secondPerm = thrust::raw_pointer_cast(&m_secondPerm[0]);
2293  int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
2294  int* p_offDiagWidths_right = thrust::raw_pointer_cast(&m_offDiagWidths_right[0]);
2295  int* p_offDiagPerms_right = thrust::raw_pointer_cast(&m_offDiagPerms_right[0]);
2296  int* p_offDiagWidths_left = thrust::raw_pointer_cast(&m_offDiagWidths_left[0]);
2297  int* p_offDiagPerms_left = thrust::raw_pointer_cast(&m_offDiagPerms_left[0]);
2298  int* p_first_rows = thrust::raw_pointer_cast(&m_first_rows[0]);
2299  int* p_offsets = thrust::raw_pointer_cast(&m_BOffsets[0]);
2300 
2301  int permuteBlockX = leftOffDiagWidth+rightOffDiagWidth;
2302  int permuteGridX = 1;
2303  int permuteGridY = n_eff;
2304  int permuteGridZ = 1;
2305  kernelConfigAdjust(permuteBlockX, permuteGridX, BLOCK_SIZE);
2306  kernelConfigAdjust(permuteGridY, permuteGridZ, MAX_GRID_DIMENSION);
2307  dim3 gridsPermute(permuteGridX, permuteGridY, permuteGridZ);
2308 
2309  buffer.resize((leftOffDiagWidth + rightOffDiagWidth) * n_eff);
2310 
2311  PrecValueType* p_buffer = thrust::raw_pointer_cast(&buffer[0]);
2312 
2313  dim3 gridsCopy((leftOffDiagWidth + rightOffDiagWidth), numPart_eff);
2314 
2315  device::copyWVFromOrToExtendedWVTranspose_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(leftOffDiagWidth + rightOffDiagWidth, m_k, rightOffDiagWidth, partSize, remainder, m_k-rightOffDiagWidth-leftOffDiagWidth, p_WV, p_extWV, false);
2316  device::columnPermute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, leftOffDiagWidth+rightOffDiagWidth, p_extWV, p_buffer, p_secondPerm);
2317 
2318  {
2319  int sweepBlockX = leftOffDiagWidth;
2320  int sweepGridX = 1;
2321  if (sweepBlockX < rightOffDiagWidth)
2322  sweepBlockX = rightOffDiagWidth;
2323  kernelConfigAdjust(sweepBlockX, sweepGridX, SWEEP_MAX_NUM_THREADS);
2324  dim3 sweepGrids(sweepGridX, 2*numPart_eff-2);
2325 
2326  device::var::fwdElim_spike<PrecValueType><<<sweepGrids, sweepBlockX>>>(n_eff, p_ks, leftOffDiagWidth + rightOffDiagWidth, rightOffDiagWidth, p_offsets, p_B, p_buffer, partSize, remainder, p_offDiagWidths_left, p_offDiagWidths_right, p_first_rows);
2327 
2328  int preBckBlockX = leftOffDiagWidth + rightOffDiagWidth;
2329  int preBckGridX = 1;
2330  int preBckGridY = n_eff;
2331  int preBckGridZ = 1;
2332  kernelConfigAdjust(preBckBlockX, preBckGridX, BLOCK_SIZE);
2333  kernelConfigAdjust(preBckGridY, preBckGridZ, MAX_GRID_DIMENSION);
2334  dim3 preBckGrids(preBckGridX, preBckGridY, preBckGridZ);
2335 
2336  device::var::preBck_offDiag_divide<PrecValueType><<<preBckGrids, preBckBlockX>>>(n_eff, leftOffDiagWidth + rightOffDiagWidth, p_ks, p_offsets, p_B, p_buffer, partSize, remainder);
2337 
2338  {
2339  int last_row = 0;
2340  for (int i = 0; i < m_numPartitions - 1; i++) {
2341  if (i < remainder)
2342  last_row += (partSize + 1);
2343  else
2344  last_row += partSize;
2345 
2346  m_first_rows_host[i] = thrust::reduce(m_secondPerm.begin()+(last_row-m_k), m_secondPerm.begin()+last_row, last_row, thrust::minimum<int>());
2347  }
2348  m_first_rows = m_first_rows_host;
2349  p_first_rows = thrust::raw_pointer_cast(&m_first_rows[0]);
2350  }
2351 
2352  device::var::bckElim_spike<PrecValueType><<<sweepGrids, sweepBlockX>>>(n_eff, p_ks, leftOffDiagWidth + rightOffDiagWidth, rightOffDiagWidth, p_offsets, p_B, p_buffer, partSize, remainder, p_offDiagWidths_left, p_offDiagWidths_right, p_first_rows);
2353  }
2354 
2355 
2356  device::columnPermute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, leftOffDiagWidth + rightOffDiagWidth, p_buffer, p_extWV, p_secondReordering);
2357  device::copyWVFromOrToExtendedWVTranspose_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(leftOffDiagWidth + rightOffDiagWidth, m_k, rightOffDiagWidth, partSize, remainder, m_k-rightOffDiagWidth-leftOffDiagWidth, p_WV, p_extWV, true);
2358 
2359  for (int i = 0; i < numPart_eff - 1; i++) {
2360  cusp::blas::fill(WV_spare, (PrecValueType) 0);
2361  device::matrixVReordering_perPartition<PrecValueType><<<m_offDiagWidths_right_host[i], numThreadsToUse>>>(m_k, p_WV+2*i*m_k*m_k, p_WV_spare, p_offDiagPerms_right+i*m_k);
2362  thrust::copy(WV_spare.begin(), WV_spare.end(), WV.begin()+(2*i*m_k*m_k));
2363 
2364  cusp::blas::fill(WV_spare, (PrecValueType) 0);
2365  device::matrixWReordering_perPartition<PrecValueType><<<m_offDiagWidths_left_host[i], numThreadsToUse>>>(m_k, p_WV+(2*i+1)*m_k*m_k, p_WV_spare, p_offDiagPerms_left+i*m_k);
2366  thrust::copy(WV_spare.begin(), WV_spare.end(), WV.begin()+((2*i+1)*m_k*m_k));
2367  }
2368  }
2369 }
2370 
2375 template <typename PrecVector>
2376 int
2377 Precond<PrecVector>::adjustNumThreads(int inNumThreads) {
2378  int prev = 0;
2379  int cur;
2380 
2381  for (int i = 0; i < 16; i++) {
2382  cur = (i+1) << 5;
2383  if (inNumThreads > cur) {
2384  prev = cur;
2385  continue;
2386  }
2387  if (inNumThreads - prev > cur - inNumThreads || prev == 0)
2388  return cur;
2389  return prev;
2390  }
2391  return 512;
2392 }
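
// Traced behavior of adjustNumThreads: the input is rounded to the nearest
// multiple of 32 (ties round down), never below 32 and never above 512.
// For illustration (adjustNumThreads is a member function, so these are
// shown as comments rather than compilable assertions):
//   adjustNumThreads(20)  == 32    // below 32 rounds up to one warp
//   adjustNumThreads(75)  == 64    // 75 is closer to 64 than to 96
//   adjustNumThreads(90)  == 96    // 90 is closer to 96 than to 64
//   adjustNumThreads(600) == 512   // capped at 512 threads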
2393 
2397 template <typename PrecVector>
2398 void
2399 Precond<PrecVector>::calculateSpikes(PrecVector& B2,
2400  PrecVector& WV)
2401 {
2402  int two_k = 2 * m_k;
2403  int partSize = m_n / m_numPartitions;
2404  int remainder = m_n % m_numPartitions;
2405 
2406  // Compress the provided UL factorization 'B2' into 'compB2'.
2407  PrecVector compB2((two_k+1)*two_k*(m_numPartitions-1));
2408  cusp::blas::fill(compB2, (PrecValueType) 0);
2409 
2410  PrecValueType* p_B2 = thrust::raw_pointer_cast(&B2[0]);
2411  PrecValueType* p_compB2 = thrust::raw_pointer_cast(&compB2[0]);
2412 
2413  dim3 gridsCompress(two_k, m_numPartitions-1);
2414 
2415  if (m_k > 511)
2416  device::copydAtodA2_general<PrecValueType><<<gridsCompress, 1024>>>(m_n, m_k, p_B2, p_compB2, two_k, partSize, m_numPartitions, remainder);
2417  else
2418  device::copydAtodA2<PrecValueType><<<gridsCompress, two_k+1>>>(m_n, m_k, p_B2, p_compB2, two_k, partSize, m_numPartitions, remainder);
2419 
2420  // Combine 'B' and 'compB2' into 'partialB'.
2421  PrecVector partialB(2*(two_k+1)*(m_k+1)*(m_numPartitions-1));
2422 
2423  PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
2424  PrecValueType* p_partialB = thrust::raw_pointer_cast(&partialB[0]);
2425 
2426  dim3 gridsCopy(m_k+1, 2*(m_numPartitions-1));
2427 
2428  if (m_k > 511)
2429  device::copydAtoPartialA_general<PrecValueType><<<gridsCopy, 1024>>>(m_n, m_k, p_B, p_compB2, p_partialB, partSize, m_numPartitions, remainder, two_k);
2430  else
2431  device::copydAtoPartialA<PrecValueType><<<gridsCopy, two_k+1>>>(m_n, m_k, p_B, p_compB2, p_partialB, partSize, m_numPartitions, remainder, two_k);
2432 
2433  // Perform forward/backward sweeps to calculate the spike blocks 'W' and 'V'.
2434  PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
2435 
2436  dim3 gridsSweep(m_numPartitions-1, m_k);
2437 
2438  if (m_k > 1024) {
2439  device::forwardElimLdWV_general<PrecValueType><<<gridsSweep, 512>>>(m_k, p_partialB, p_WV, m_k, 0, 0);
2440  device::backwardElimUdWV_general<PrecValueType><<<gridsSweep, 512>>>(m_k, p_partialB, p_WV, m_k, 0, 1);
2441  device::backwardElimUdWV_general<PrecValueType><<<gridsSweep, 512>>>(m_k, p_partialB, p_WV, m_k, 1, 0);
2442  device::forwardElimLdWV_general<PrecValueType><<<gridsSweep, 512>>>(m_k, p_partialB, p_WV, m_k, 1, 1);
2443  } else if (m_k > 32) {
2444  device::forwardElimLdWV_g32<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 0, 0);
2445  device::backwardElimUdWV_g32<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 0, 1);
2446  device::backwardElimUdWV_g32<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 1, 0);
2447  device::forwardElimLdWV_g32<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 1, 1);
2448  } else {
2449  device::forwardElimLdWV<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 0, 0);
2450  device::backwardElimUdWV<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 0, 1);
2451  device::backwardElimUdWV<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 1, 0);
2452  device::forwardElimLdWV<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 1, 1);
2453  }
2454 }
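
// Note on this overload (constant bandwidth, LU/UL method): consistent with
// the truncated SPIKE scheme, the LU factors in m_B and the UL factors in B2
// are compressed and combined into partialB, which (judging by its sizing and
// the m_k+1 copy grid) keeps the k+1 boundary rows of each factor per
// interface; the four dWV sweeps then produce the bottom block of each right
// spike V and the top block of each left spike W.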
2455 
2459 template <typename PrecVector>
2460 void
 2461 Precond<PrecVector>::assembleReducedMat(PrecVector& WV)
 2462 {
2463  PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
2464  PrecValueType* p_R = thrust::raw_pointer_cast(&m_R[0]);
2465 
2466  dim3 grids(m_k, m_numPartitions-1);
2467 
2468  if (!m_variableBandwidth) {
2469  if (m_k > 1024)
2470  device::assembleReducedMat_general<PrecValueType><<<grids, 512>>>(m_k, p_WV, p_R);
2471  else if (m_k > 32)
2472  device::assembleReducedMat_g32<PrecValueType><<<grids, m_k>>>(m_k, p_WV, p_R);
2473  else
2474  device::assembleReducedMat<PrecValueType><<<m_numPartitions-1, m_k*m_k>>>(m_k, p_WV, p_R);
2475  } else {
2476  int* p_WVOffsets = thrust::raw_pointer_cast(&m_WVOffsets[0]);
2477  int* p_ROffsets = thrust::raw_pointer_cast(&m_ROffsets[0]);
2478  int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
2479 
2480  if (m_k > 1024)
2481  device::assembleReducedMat_var_bandwidth_general<PrecValueType><<<grids, 512>>>(p_spike_ks, p_WVOffsets, p_ROffsets, p_WV, p_R);
2482  else if (m_k > 32)
2483  device::assembleReducedMat_var_bandwidth_g32<PrecValueType><<<grids, m_k>>>(p_spike_ks, p_WVOffsets, p_ROffsets, p_WV, p_R);
2484  else
2485  device::assembleReducedMat_var_bandwidth<PrecValueType><<<m_numPartitions-1, m_k*m_k>>>(p_spike_ks, p_WVOffsets, p_ROffsets, p_WV, p_R);
2486  }
2487 }
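
// In the SPIKE scheme, each of the m_numPartitions-1 diagonal blocks of the
// reduced matrix R assembled here couples two adjacent partitions through
// the bottom k x k block of the right spike V_i and the top k x k block of
// the left spike W_{i+1}:
//
//          [ I             V_i(bottom) ]
//    R_i = [                           ]
//          [ W_{i+1}(top)  I           ]
//
// (schematic; with variable bandwidths the per-interface sizes come from
// m_spike_ks instead of a single k).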
2488 
2493 template <typename PrecVector>
2494 void
2495 Precond<PrecVector>::copyLastPartition(PrecVector &B2) {
2496  thrust::copy(B2.begin()+(2*m_k+1) * (m_n - m_n / m_numPartitions), B2.end(), m_B.begin()+(2*m_k+1) * (m_n - m_n / m_numPartitions) );
2497 }
2498 
2499 
2504 template <typename PrecVector>
2505 bool
2506 Precond<PrecVector>::hasZeroPivots(const PrecVectorIterator& start_B,
2507  const PrecVectorIterator& end_B,
2508  int k,
2509  PrecValueType threshold)
2510 {
2511  // Create a strided range to select the main diagonal
2512  strided_range<typename PrecVector::iterator> diag(start_B + k, end_B, 2*k + 1);
2513 
2517 
2518  // Check if any of the diagonal elements is within the specified threshold
2519  return thrust::any_of(diag.begin(), diag.end(), SmallerThan<PrecValueType>(threshold));
2520 }
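
// Standalone sketch of the same test for host-side experimentation: the index
// walk B[k], B[3k+1], ... mirrors the strided_range above, and the predicate
// mirrors SmallerThan (assumed here to flag |pivot| < threshold).
template <typename Vector>
bool anySmallPivot(const Vector& B, int k, typename Vector::value_type eps)
{
	typedef typename Vector::value_type T;
	for (int i = k; i < (int)B.size(); i += 2 * k + 1) {  // main-diagonal entries
		T d = B[i];
		if (d < eps && d > -eps)   // |pivot| < eps
			return true;
	}
	return false;
}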
2521 
2522 
2523 
2524 } // namespace spike
2525 
2526 
2527 #endif