5 #ifndef SPIKE_PRECOND_CUH
6 #define SPIKE_PRECOND_CUH
9 #include <cusp/print.h>
11 #include <thrust/logical.h>
12 #include <thrust/functional.h>
36 template <
typename PrecVector>
44 typedef typename cusp::array1d<int, MemorySpace>
IntVector;
46 typedef typename cusp::array1d<PrecValueType, MemorySpace>
MatrixMapF;
48 typedef typename cusp::array1d<PrecValueType, cusp::host_memory>
PrecVectorH;
49 typedef typename cusp::array1d<int, cusp::host_memory>
IntVectorH;
51 typedef typename cusp::array1d<PrecValueType, cusp::host_memory>
MatrixMapFH;
53 typedef typename cusp::coo_matrix<int, PrecValueType, MemorySpace>
PrecMatrixCoo;
54 typedef typename cusp::coo_matrix<int, PrecValueType, cusp::host_memory>
PrecMatrixCooH;
67 bool safeFactorization,
68 bool variableBandwidth,
69 bool trackReordering);
98 template <
typename Matrix>
99 void setup(
const Matrix& A);
101 void update(
const PrecVector& entries);
103 void solve(PrecVector& v, PrecVector& z);
117 bool m_safeFactorization;
118 bool m_variableBandwidth;
119 bool m_trackReordering;
157 PrecVector m_mc64RowScale;
158 PrecVector m_mc64ColScale;
161 PrecVector m_offDiags;
173 double m_time_reorder;
174 double m_time_cpu_assemble;
175 double m_time_transfer;
176 double m_time_toBanded;
177 double m_time_offDiags;
178 double m_time_bandLU;
179 double m_time_bandUL;
180 double m_time_assembly;
181 double m_time_fullLU;
182 double m_time_shuffle;
184 template <
typename Matrix>
185 void transformToBandedMatrix(
const Matrix& A);
187 template <
typename Matrix>
188 void convertToBandedMatrix(
const Matrix& A);
190 void extractOffDiagonal(PrecVector& mat_WV);
193 void partBandedLU_const();
194 void partBandedLU_one();
195 void partBandedLU_var();
196 void partBandedUL(PrecVector& B);
198 void partBandedFwdSweep(PrecVector& v);
199 void partBandedFwdSweep_const(PrecVector& v);
200 void partBandedFwdSweep_var(PrecVector& v);
201 void partBandedBckSweep(PrecVector& v);
202 void partBandedBckSweep_const(PrecVector& v);
203 void partBandedBckSweep_var(PrecVector& v);
206 void partFullLU_const();
207 void partFullLU_var();
209 void partFullFwdSweep(PrecVector& v);
210 void partFullBckSweep(PrecVector& v);
211 void purifyRHS(PrecVector& v, PrecVector& res);
213 void calculateSpikes(PrecVector& WV);
214 void calculateSpikes_const(PrecVector& WV);
215 void calculateSpikes_var(PrecVector& WV);
216 void calculateSpikes_var_old(PrecVector& WV);
217 void calculateSpikes(PrecVector& B2, PrecVector& WV);
219 int adjustNumThreads(
int inNumThreads);
222 void assembleReducedMat(PrecVector& WV);
224 void copyLastPartition(PrecVector& B2);
226 void leftTrans(PrecVector& v, PrecVector& z);
227 void rightTrans(PrecVector& v, PrecVector& z);
228 void permute(PrecVector& v,
IntVector& perm, PrecVector& w);
229 void permuteAndScale(PrecVector& v,
IntVector& perm, PrecVector& scale, PrecVector& w);
230 void scaleAndPermute(PrecVector& v,
IntVector& perm, PrecVector& scale, PrecVector& w);
233 void getSRev(PrecVector& rhs, PrecVector& sol);
246 struct Multiply:
public thrust::unary_function<T, T>
250 return thrust::get<0>(tu) * thrust::get<1>(tu);
254 template <
typename T>
269 template <
typename PrecVector>
278 bool safeFactorization,
279 bool variableBandwidth,
280 bool trackReordering)
281 : m_numPartitions(numPart),
286 m_maxBandwidth(maxBandwidth),
287 m_factMethod(factMethod),
288 m_precondType(precondType),
289 m_safeFactorization(safeFactorization),
290 m_variableBandwidth(variableBandwidth),
291 m_trackReordering(trackReordering),
296 m_time_cpu_assemble(0),
311 template <
typename PrecVector>
319 m_maxBandwidth(std::numeric_limits<int>::max()),
321 m_time_cpu_assemble(0),
336 template <
typename PrecVector>
342 m_time_cpu_assemble(0),
352 m_numPartitions = prec.m_numPartitions;
354 m_reorder = prec.m_reorder;
355 m_doMC64 = prec.m_doMC64;
356 m_scale = prec.m_scale;
357 m_dropOff_frac = prec.m_dropOff_frac;
358 m_maxBandwidth = prec.m_maxBandwidth;
359 m_factMethod = prec.m_factMethod;
360 m_precondType = prec.m_precondType;
361 m_safeFactorization = prec.m_safeFactorization;
362 m_variableBandwidth = prec.m_variableBandwidth;
363 m_trackReordering = prec.m_trackReordering;
366 template <
typename PrecVector>
370 m_numPartitions = prec.m_numPartitions;
372 m_reorder = prec.m_reorder;
373 m_doMC64 = prec.m_doMC64;
374 m_scale = prec.m_scale;
375 m_dropOff_frac = prec.m_dropOff_frac;
376 m_maxBandwidth = prec.m_maxBandwidth;
377 m_factMethod = prec.m_factMethod;
378 m_precondType = prec.m_precondType;
379 m_safeFactorization = prec.m_safeFactorization;
380 m_variableBandwidth = prec.m_variableBandwidth;
381 m_trackReordering = prec.m_trackReordering;
383 m_ks_host = prec.m_ks_host;
384 m_offDiagWidths_left_host = prec.m_offDiagWidths_left_host;
385 m_offDiagWidths_right_host = prec.m_offDiagWidths_right_host;
386 m_first_rows_host = prec.m_first_rows_host;
387 m_BOffsets_host = prec.m_BOffsets_host;
404 template <
typename PrecVector>
408 m_time_reorder = 0.0;
416 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.begin(), m_scaleMap.begin())),
Multiply<PrecValueType>()),
417 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.end(), m_scaleMap.end())),
Multiply<PrecValueType>()),
418 m_bandedMatMap.begin(),
423 m_time_cpu_assemble = m_timer.getElapsed();
425 m_time_transfer = 0.0;
434 if (m_precondType ==
Block || m_numPartitions == 1) {
439 m_time_bandLU = m_timer.getElapsed();
448 m_R.resize((2 * m_k) * (2 * m_k) * (m_numPartitions - 1));
453 mat_WV.resize(2 * m_k * m_k * (m_numPartitions-1));
459 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.begin(), m_scaleMap.begin())),
Multiply<PrecValueType>()),
460 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.end(), m_scaleMap.end())),
Multiply<PrecValueType>()),
461 m_offDiagMap.begin(),
464 thrust::logical_not<int>()
468 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.begin(), m_scaleMap.begin())),
Multiply<PrecValueType>()),
469 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(entries.end(), m_scaleMap.end())),
Multiply<PrecValueType>()),
473 thrust::logical_not<int>()
476 m_time_offDiags = m_timer.getElapsed();
479 switch (m_factMethod) {
489 m_time_bandLU = m_timer.getElapsed();
494 calculateSpikes(mat_WV);
497 m_time_assembly = m_timer.getElapsed();
512 cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
517 m_time_bandLU = m_timer.getElapsed();
524 m_time_bandUL = m_timer.getElapsed();
528 cudaDeviceSetCacheConfig(cudaFuncCachePreferNone);
531 calculateSpikes(B2, mat_WV);
533 copyLastPartition(B2);
535 m_time_assembly = m_timer.getElapsed();
549 m_time_fullLU = m_timer.getElapsed();
562 template <
typename PrecVector>
563 template <
typename Matrix>
572 transformToBandedMatrix(A);
574 convertToBandedMatrix(A);
583 if (m_precondType ==
Block || m_numPartitions == 1) {
588 m_time_bandLU = m_timer.getElapsed();
597 m_R.resize((2 * m_k) * (2 * m_k) * (m_numPartitions - 1));
602 mat_WV.resize(2 * m_k * m_k * (m_numPartitions-1));
605 extractOffDiagonal(mat_WV);
607 m_time_offDiags = m_timer.getElapsed();
610 switch (m_factMethod) {
620 m_time_bandLU = m_timer.getElapsed();
625 calculateSpikes(mat_WV);
628 m_time_assembly = m_timer.getElapsed();
643 cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
648 m_time_bandLU = m_timer.getElapsed();
655 m_time_bandUL = m_timer.getElapsed();
659 cudaDeviceSetCacheConfig(cudaFuncCachePreferNone);
662 calculateSpikes(B2, mat_WV);
664 copyLastPartition(B2);
666 m_time_assembly = m_timer.getElapsed();
680 m_time_fullLU = m_timer.getElapsed();
687 template <
typename PrecVector>
694 static PrecVector buffer;
697 rightTrans(buffer, z);
699 cusp::blas::copy(v, z);
700 PrecVector buffer = z;
708 template <
typename PrecVector>
714 thrust::transform(rhs.begin(), rhs.end(), m_B.begin(), sol.begin(), thrust::divides<PrecValueType>());
718 if (m_numPartitions > 1 && m_precondType ==
Spike) {
719 if (!m_variableBandwidth) {
722 partBandedFwdSweep(rhs);
723 partBandedBckSweep(rhs);
726 partFullFwdSweep(rhs);
727 partFullBckSweep(rhs);
732 static PrecVector buffer;
734 permute(rhs, m_secondReordering,buffer);
736 partBandedFwdSweep(rhs);
737 partBandedBckSweep(rhs);
739 permute(rhs, m_secondReordering, sol);
742 partFullFwdSweep(sol);
743 partFullBckSweep(sol);
745 purifyRHS(sol, buffer);
746 permute(buffer, m_secondPerm, sol);
752 partBandedFwdSweep(sol);
753 partBandedBckSweep(sol);
761 template <
typename PrecVector>
763 Precond<PrecVector>::leftTrans(PrecVector& v,
767 scaleAndPermute(v, m_optPerm, m_mc64RowScale, z);
776 template <
typename PrecVector>
778 Precond<PrecVector>::rightTrans(PrecVector& v,
782 permuteAndScale(v, m_optReordering, m_mc64ColScale, z);
784 permute(v, m_optReordering, z);
791 template <
typename PrecVector>
798 thrust::scatter(v.begin(), v.end(), perm.begin(), w.begin());
800 m_time_shuffle += m_timer.getElapsed();
807 template <
typename PrecVector>
809 Precond<PrecVector>::permuteAndScale(PrecVector& v,
817 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(v.begin(), thrust::make_permutation_iterator(scale.begin(), perm.begin()))), Multiply<PrecValueType>()),
818 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(v.end(), thrust::make_permutation_iterator(scale.end(), perm.end()))), Multiply<PrecValueType>()),
824 m_time_shuffle += m_timer.getElapsed();
831 template <
typename PrecVector>
833 Precond<PrecVector>::scaleAndPermute(PrecVector& v,
840 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(v.begin(), scale.begin())), Multiply<PrecValueType>()),
841 thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(v.end(), scale.end())), Multiply<PrecValueType>()),
846 m_time_shuffle += m_timer.getElapsed();
852 template <
typename PrecVector>
854 Precond<PrecVector>::combinePermutation(IntVector& perm,
856 IntVector& finalPerm)
859 thrust::gather(perm.begin(), perm.end(), perm2.begin(), finalPerm.begin());
861 m_time_shuffle += m_timer.getElapsed();
882 template <
typename PrecVector>
883 template <
typename Matrix>
885 Precond<PrecVector>::transformToBandedMatrix(
const Matrix& A)
887 CPUTimer reorder_timer, assemble_timer, transfer_timer;
889 transfer_timer.Start();
893 PrecMatrixCooH Acoo = A;
894 transfer_timer.Stop();
895 m_time_transfer = transfer_timer.getElapsed();
901 PrecVectorH mc64RowScale;
902 PrecVectorH mc64ColScale;
906 Graph<PrecValueType> graph(m_trackReordering);
911 MatrixMapH offDiagMap;
914 MatrixMapH bandedMatMap;
915 MatrixMapFH scaleMap;
918 reorder_timer.Start();
919 m_k_reorder = graph.reorder(Acoo, m_doMC64, m_scale, optReordering, optPerm, mc64RowPerm, mc64RowScale, mc64ColScale, scaleMap, m_k_mc64);
920 reorder_timer.Stop();
922 m_time_reorder += reorder_timer.getElapsed();
926 if (m_k_reorder > m_maxBandwidth || m_dropOff_frac > 0)
927 dropped = graph.dropOff(m_dropOff_frac, m_maxBandwidth, m_dropOff_actual);
929 m_dropOff_actual = 0;
932 m_k = m_k_reorder - dropped;
940 int maxNumPartitions = std::max(m_n / (m_k + 1), 1);
941 m_numPartitions = std::min(m_numPartitions, maxNumPartitions);
944 if (m_numPartitions == 1 || m_k == 0)
945 m_variableBandwidth =
false;
949 if (m_variableBandwidth) {
950 assemble_timer.Start();
951 graph.assembleOffDiagMatrices(m_k, m_numPartitions, m_WV_host, m_offDiags_host, m_offDiagWidths_left_host, m_offDiagWidths_right_host, offDiagPerms_left, offDiagPerms_right, typeMap, offDiagMap, WVMap);
952 assemble_timer.Stop();
953 m_time_cpu_assemble += assemble_timer.getElapsed();
955 reorder_timer.Start();
956 graph.secondLevelReordering(m_k, m_numPartitions, secondReorder, secondPerm, m_first_rows_host);
957 reorder_timer.Stop();
958 m_time_reorder += reorder_timer.getElapsed();
960 assemble_timer.Start();
961 graph.assembleBandedMatrix(m_k, m_numPartitions, m_ks_col_host, m_ks_row_host, B,
962 m_ks_host, m_BOffsets_host,
963 typeMap, bandedMatMap);
964 assemble_timer.Stop();
965 m_time_cpu_assemble += assemble_timer.getElapsed();
967 assemble_timer.Start();
968 graph.assembleBandedMatrix(m_k, m_ks_col_host, m_ks_row_host, B, typeMap, bandedMatMap);
969 assemble_timer.Stop();
970 m_time_cpu_assemble += assemble_timer.getElapsed();
973 transfer_timer.Start();
976 m_optReordering = optReordering;
980 m_mc64RowScale = mc64RowScale;
981 m_mc64ColScale = mc64ColScale;
986 if (m_variableBandwidth) {
988 m_offDiagWidths_left = m_offDiagWidths_left_host;
989 m_offDiagWidths_right = m_offDiagWidths_right_host;
990 m_offDiagPerms_left = offDiagPerms_left;
991 m_offDiagPerms_right = offDiagPerms_right;
992 m_BOffsets = m_BOffsets_host;
994 m_spike_ks.resize(m_numPartitions - 1);
995 m_ROffsets.resize(m_numPartitions - 1);
996 m_WVOffsets.resize(m_numPartitions - 1);
997 m_compB2Offsets.resize(m_numPartitions - 1);
998 m_partialBOffsets.resize(m_numPartitions-1);
1000 cusp::blas::fill(m_spike_ks, m_k);
1001 thrust::sequence(m_ROffsets.begin(), m_ROffsets.end(), 0, 4*m_k*m_k);
1002 thrust::sequence(m_WVOffsets.begin(), m_WVOffsets.end(), 0, 2*m_k*m_k);
1003 thrust::sequence(m_partialBOffsets.begin(), m_partialBOffsets.end(), 0, 2 * (m_k+1) * (2*m_k+ 1));
1004 thrust::sequence(m_compB2Offsets.begin(), m_compB2Offsets.end(), 0, 2 * m_k * (2 * m_k + 1));
1007 if (m_variableBandwidth) {
1008 IntVector buffer2(m_n);
1009 m_secondReordering = secondReorder;
1010 combinePermutation(m_secondReordering, m_optReordering, buffer2);
1011 m_optReordering = buffer2;
1013 m_secondPerm = secondPerm;
1014 combinePermutation(m_optPerm, m_secondPerm, buffer2);
1015 m_optPerm = buffer2;
1017 m_first_rows = m_first_rows_host;
1021 IntVector buffer = mc64RowPerm, buffer2(m_n);
1022 combinePermutation(buffer, m_optPerm, buffer2);
1023 m_optPerm = buffer2;
1026 if (m_trackReordering) {
1027 m_offDiagMap = offDiagMap;
1029 m_typeMap = typeMap;
1030 m_bandedMatMap = bandedMatMap;
1031 m_scaleMap = scaleMap;
1034 transfer_timer.Stop();
1035 m_time_transfer += transfer_timer.getElapsed();
1043 template <
typename PrecVector>
1044 template <
typename Matrix>
1046 Precond<PrecVector>::convertToBandedMatrix(
const Matrix& A)
1049 PrecMatrixCoo Acoo = A;
1050 int n = Acoo.num_rows;
1051 int nnz = Acoo.num_entries;
1057 IntVector buffer(nnz);
1058 cusp::blas::axpby(Acoo.row_indices, Acoo.column_indices, buffer, 1, -1);
1059 m_k = cusp::blas::nrmmax(buffer);
1068 int maxNumPartitions = std::max(1, m_n / std::max(m_k + 1, 2 * m_k));
1069 m_numPartitions = std::min(m_numPartitions, maxNumPartitions);
1072 if (m_numPartitions == 1)
1073 m_variableBandwidth =
false;
1076 m_B.resize((2*m_k+1)*n);
1078 int blockX = nnz, gridX = 1, gridY = 1;
1080 dim3 grids(gridX, gridY);
1082 int* d_rows = thrust::raw_pointer_cast(&(Acoo.row_indices[0]));
1083 int* d_cols = thrust::raw_pointer_cast(&(Acoo.column_indices[0]));
1084 PrecValueType* d_vals = thrust::raw_pointer_cast(&(Acoo.values[0]));
1085 PrecValueType* dB = thrust::raw_pointer_cast(&m_B[0]);
1088 device::copyFromCOOMatrixToBandedMatrix<<<grids, blockX>>>(nnz, m_k, d_rows, d_cols, d_vals, dB);
1090 m_time_toBanded = m_timer.getElapsed();
1099 template <
typename PrecVector>
1101 Precond<PrecVector>::extractOffDiagonal(PrecVector& mat_WV)
1104 if (m_variableBandwidth) {
1106 m_offDiags = m_offDiags_host;
1110 m_offDiags.resize(2 * m_k * m_k * (m_numPartitions - 1));
1112 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1113 PrecValueType* p_WV = thrust::raw_pointer_cast(&mat_WV[0]);
1114 PrecValueType* p_offDiags = thrust::raw_pointer_cast(&m_offDiags[0]);
1115 int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
1117 int partSize = m_n / m_numPartitions;
1118 int remainder = m_n % m_numPartitions;
1120 dim3 grids(m_k, m_numPartitions-1);
1123 device::copydWV_general<PrecValueType><<<grids, 512>>>(m_k, p_B, p_WV, p_offDiags, partSize, m_numPartitions, remainder);
1125 device::copydWV_g32<PrecValueType><<<grids, m_k>>>(m_k, p_B, p_WV, p_offDiags, partSize, m_numPartitions, remainder);
1127 device::copydWV<PrecValueType><<<m_numPartitions-1, m_k*m_k>>>(m_k, p_B, p_WV, p_offDiags, partSize, m_numPartitions, remainder);
1140 template <
typename PrecVector>
1142 Precond<PrecVector>::partFullLU()
1144 if (!m_variableBandwidth)
1151 template <
typename PrecVector>
1153 Precond<PrecVector>::partFullLU_const()
1155 PrecValueType* d_R = thrust::raw_pointer_cast(&m_R[0]);
1156 int two_k = 2 * m_k;
1161 dim3 grids(m_k, m_numPartitions-1);
1164 device::fullLU_sub_spec_general<PrecValueType><<<grids, 512>>>(d_R, two_k, m_k);
1166 device::fullLU_sub_spec<PrecValueType><<<grids, m_k>>>(d_R, two_k, m_k);
1171 if (m_safeFactorization) {
1172 for(
int i = m_k; i < two_k-1; i++) {
1173 int threads = two_k-1-i;
1174 dim3 grids(two_k-1-i, m_numPartitions-1);
1176 if(threads > 1024) {
1177 device::fullLU_div_safe_general<PrecValueType><<<m_numPartitions-1, 512>>>(d_R, m_k, two_k, i);
1178 device::fullLU_sub_general<PrecValueType><<<grids, 512>>>(d_R, m_k, two_k, i);
1180 device::fullLU_div_safe<PrecValueType><<<m_numPartitions-1, threads>>>(d_R, two_k, i);
1181 device::fullLU_sub<PrecValueType><<<grids, threads>>>(d_R, two_k, i);
1185 for(
int i = m_k; i < two_k-1; i++) {
1186 int threads = two_k-1-i;
1187 dim3 grids(two_k-1-i, m_numPartitions-1);
1189 if(threads > 1024) {
1190 device::fullLU_div_general<PrecValueType><<<m_numPartitions-1, 512>>>(d_R, m_k, two_k, i);
1191 device::fullLU_sub_general<PrecValueType><<<grids, 512>>>(d_R, m_k, two_k, i);
1193 device::fullLU_div<PrecValueType><<<m_numPartitions-1, threads>>>(d_R, two_k, i);
1194 device::fullLU_sub<PrecValueType><<<grids, threads>>>(d_R, two_k, i);
1200 template <
typename PrecVector>
1202 Precond<PrecVector>::partFullLU_var()
1204 PrecValueType* d_R = thrust::raw_pointer_cast(&m_R[0]);
1205 int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
1206 int* p_ROffsets = thrust::raw_pointer_cast(&m_ROffsets[0]);
1208 int two_k = 2 * m_k;
1213 dim3 grids(m_k, m_numPartitions-1);
1216 device::var::fullLU_sub_spec_general<PrecValueType><<<grids, 512>>>(d_R, p_spike_ks, p_ROffsets);
1218 device::var::fullLU_sub_spec<PrecValueType><<<grids, m_k>>>(d_R, p_spike_ks, p_ROffsets);
1223 if (m_safeFactorization) {
1224 for(
int i = m_k; i < two_k-1; i++) {
1225 int threads = two_k-1-i;
1226 dim3 grids(two_k-1-i, m_numPartitions-1);
1228 if(threads > 1024) {
1229 device::var::fullLU_div_safe_general<PrecValueType><<<m_numPartitions-1, 512>>>(d_R, p_spike_ks, p_ROffsets, i);
1230 device::var::fullLU_sub_general<PrecValueType><<<grids, 512>>>(d_R, p_spike_ks, p_ROffsets, i);
1232 device::var::fullLU_div_safe<PrecValueType><<<m_numPartitions-1, threads>>>(d_R, p_spike_ks, p_ROffsets, i);
1233 device::var::fullLU_sub<PrecValueType><<<grids, threads>>>(d_R, p_spike_ks, p_ROffsets, i);
1237 for(
int i = m_k; i < two_k-1; i++) {
1238 int threads = two_k-1-i;
1239 dim3 grids(two_k-1-i, m_numPartitions-1);
1241 if(threads > 1024) {
1242 device::var::fullLU_div_general<PrecValueType><<<m_numPartitions-1, 512>>>(d_R, p_spike_ks, p_ROffsets, i);
1243 device::var::fullLU_sub_general<PrecValueType><<<grids, 512>>>(d_R, p_spike_ks, p_ROffsets, i);
1245 device::var::fullLU_div<PrecValueType><<<m_numPartitions-1, threads>>>(d_R, p_spike_ks, p_ROffsets, i);
1246 device::var::fullLU_sub<PrecValueType><<<grids, threads>>>(d_R, p_spike_ks, p_ROffsets, i);
1252 dim3 grids(m_k-1, m_numPartitions-1);
1254 device::var::fullLU_post_divide_general<PrecValueType><<<grids, 512>>>(d_R, p_spike_ks, p_ROffsets);
1256 device::var::fullLU_post_divide<PrecValueType><<<grids, m_k-1>>>(d_R, p_spike_ks, p_ROffsets);
1267 template <
typename PrecVector>
1269 Precond<PrecVector>::partBandedLU()
1271 if (m_variableBandwidth) {
1277 if (m_numPartitions > 1)
1278 partBandedLU_const();
1284 template <
typename PrecVector>
1286 Precond<PrecVector>::partBandedLU_one()
1291 PrecValueType* dB = thrust::raw_pointer_cast(&m_B[0]);
1293 if (m_ks_col_host.size() != m_n)
1294 m_ks_col_host.resize(m_n, m_k);
1296 if (m_ks_row_host.size() != m_n)
1297 m_ks_row_host.resize(m_n, m_k);
1302 for (
int st_row = 0; st_row < m_n-1; st_row++) {
1303 threadsNum = m_ks_col_host[st_row];
1306 int blockX = m_ks_row_host[st_row];
1309 if (threadsNum > 1024) {
1310 if (m_safeFactorization)
1311 device::bandLU_critical_div_onePart_safe_general<PrecValueType><<<1, 512>>>(dB, st_row, m_k, threadsNum);
1313 device::bandLU_critical_div_onePart_general<PrecValueType><<<threadsNum/512+1, 512>>>(dB, st_row, m_k, threadsNum);
1315 device::bandLU_critical_sub_onePart_general<PrecValueType><<<blockX, 512>>>(dB, st_row, m_k, threadsNum);
1317 if (m_safeFactorization)
1318 device::bandLU_critical_div_onePart_safe<PrecValueType><<<1, threadsNum>>>(dB, st_row, m_k);
1320 device::bandLU_critical_div_onePart<PrecValueType><<<1, threadsNum>>>(dB, st_row, m_k);
1322 device::bandLU_critical_sub_onePart<PrecValueType><<<blockX, threadsNum>>>(dB, st_row, m_k);
1325 }
else if (m_k > 27) {
1326 if (m_safeFactorization)
1327 device::bandLU_g32_safe<PrecValueType><<<1, 512>>>(dB, m_k, m_n, 0);
1329 device::bandLU_g32<PrecValueType><<<1, 512>>>(dB, m_k, m_n, 0);
1331 if (m_safeFactorization)
1332 device::bandLU_safe<PrecValueType><<<1, m_k * m_k>>>(dB, m_k, m_n, 0);
1334 device::bandLU<PrecValueType><<<1, m_k * m_k>>>(dB, m_k, m_n, 0);
1339 if (m_safeFactorization)
1340 device::boostLastPivot<PrecValueType><<<1, 1>>>(dB, m_n, m_k, m_n, 0);
1345 if (!m_safeFactorization && hasZeroPivots(m_B.begin(), m_B.end(), m_k, (PrecValueType)
BURST_VALUE))
1349 int gridX = m_n, gridY = 1;
1351 dim3 grids(gridX, gridY);
1353 device::bandLU_post_divide_general<PrecValueType><<<grids, 512>>>(dB, m_k, m_n);
1355 device::bandLU_post_divide<PrecValueType><<<grids, m_k>>>(dB, m_k, m_n);
1358 template <
typename PrecVector>
1360 Precond<PrecVector>::partBandedLU_const()
1367 PrecValueType* dB = thrust::raw_pointer_cast(&m_B[0]);
1370 int numPart_eff = m_numPartitions;
1372 if (m_factMethod ==
LU_UL && m_numPartitions > 1 && m_precondType !=
Block) {
1373 n_eff -= m_n / m_numPartitions;
1377 int partSize = n_eff / numPart_eff;
1378 int remainder = n_eff % numPart_eff;
1381 int final_partition_size = partSize + 1;
1384 for (
int st_row = 0; st_row < final_partition_size - 1; st_row++) {
1386 if (remainder == 0)
continue;
1388 if (threadsNum > final_partition_size - 1)
1389 threadsNum = final_partition_size - 1;
1390 if (threadsNum > 1024) {
1391 if (m_safeFactorization)
1392 device::bandLU_critical_div_safe_general<PrecValueType><<<remainder, 512>>>(dB, st_row, m_k, partSize, remainder);
1394 device::bandLU_critical_div_general<PrecValueType><<<remainder, 512>>>(dB, st_row, m_k, partSize, remainder);
1395 dim3 tmpGrids(threadsNum, remainder);
1396 device::bandLU_critical_sub_general<PrecValueType><<<tmpGrids, 512>>>(dB, st_row, m_k, partSize, remainder);
1398 if (m_safeFactorization)
1399 device::bandLU_critical_div_safe<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1401 device::bandLU_critical_div<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1403 dim3 tmpGrids(threadsNum, remainder);
1404 device::bandLU_critical_sub<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1408 if (threadsNum > final_partition_size - st_row - 1)
1409 threadsNum = final_partition_size - st_row - 1;
1410 if (threadsNum > 1024) {
1411 if (m_safeFactorization)
1412 device::bandLU_critical_div_safe_general<PrecValueType><<<numPart_eff, 512>>>(dB, st_row, m_k, partSize, remainder);
1414 device::bandLU_critical_div_general<PrecValueType><<<numPart_eff, 512>>>(dB, st_row, m_k, partSize, remainder);
1416 dim3 tmpGrids(threadsNum, numPart_eff);
1417 device::bandLU_critical_sub_general<PrecValueType><<<tmpGrids, 512>>>(dB, st_row, m_k, partSize, remainder);
1419 if (m_safeFactorization)
1420 device::bandLU_critical_div_safe<PrecValueType><<<numPart_eff, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1422 device::bandLU_critical_div<PrecValueType><<<numPart_eff, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1424 dim3 tmpGrids(threadsNum, numPart_eff);
1425 device::bandLU_critical_sub<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1429 }
else if (m_k > 27) {
1430 if (m_safeFactorization)
1431 device::bandLU_g32_safe<PrecValueType><<<numPart_eff, 512>>>(dB, m_k, partSize, remainder);
1433 device::bandLU_g32<PrecValueType><<<numPart_eff, 512>>>(dB, m_k, partSize, remainder);
1435 if (m_safeFactorization)
1436 device::bandLU_safe<PrecValueType><<<numPart_eff, m_k * m_k>>>(dB, m_k, partSize, remainder);
1438 device::bandLU<PrecValueType><<<numPart_eff, m_k * m_k>>>(dB, m_k, partSize, remainder);
1446 if (!m_safeFactorization && hasZeroPivots(m_B.begin(), m_B.begin() + n_eff * (2*m_k+1), m_k, (PrecValueType)
BURST_VALUE))
1447 throw system_error(system_error::Zero_pivoting, "Found a pivot equal to zero (partBandedLU_const).");
1450 if (m_numPartitions == 1) {
1454 dim3 grids(gridX, gridY);
1456 device::bandLU_post_divide_general<PrecValueType><<<grids, 512>>>(dB, m_k, m_n);
1458 device::bandLU_post_divide<PrecValueType><<<grids, m_k>>>(dB, m_k, m_n);
1463 template <
typename PrecVector>
1465 Precond<PrecVector>::partBandedLU_var()
1471 PrecValueType* dB = thrust::raw_pointer_cast(&m_B[0]);
1472 int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
1473 int* p_BOffsets = thrust::raw_pointer_cast(&m_BOffsets[0]);
1475 int tmp_k = cusp::blas::nrmmax(m_ks);
1476 int partSize = m_n / m_numPartitions;
1477 int remainder = m_n % m_numPartitions;
1480 int final_partition_size = partSize + 1;
1482 int threadsNum = adjustNumThreads(cusp::blas::nrm1(m_ks) / m_numPartitions);
1485 for (
int st_row = 0; st_row < final_partition_size - 1; st_row++) {
1487 if (remainder == 0)
continue;
1489 blockY = m_ks_row_host[st_row];
1490 last = m_ks_col_host[st_row];
1491 int corres_row = st_row;
1492 for (
int i = 1; i < remainder; i++) {
1493 corres_row += partSize + 1;
1494 if (blockY < m_ks_row_host[corres_row])
1495 blockY = m_ks_row_host[corres_row];
1496 if (last < m_ks_col_host[corres_row])
1497 last = m_ks_col_host[corres_row];
1500 if (m_safeFactorization)
1501 device::var::bandLU_critical_div_safe_general<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder);
1503 device::var::bandLU_critical_div_general<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder);
1505 dim3 tmpGrids(blockY, remainder);
1506 device::var::bandLU_critical_sub_general<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder, last);
1510 int corres_row = st_row;
1511 for (
int i = 0; i < remainder; i++) {
1512 if (blockY < m_ks_row_host[corres_row])
1513 blockY = m_ks_row_host[corres_row];
1514 if (last < m_ks_col_host[corres_row])
1515 last = m_ks_col_host[corres_row];
1516 corres_row += partSize + 1;
1519 for (
int i = remainder; i < m_numPartitions; i++) {
1520 if (blockY < m_ks_row_host[corres_row])
1521 blockY = m_ks_row_host[corres_row];
1522 if (last < m_ks_col_host[corres_row])
1523 last = m_ks_col_host[corres_row];
1524 corres_row += partSize;
1527 if (m_safeFactorization)
1528 device::var::bandLU_critical_div_safe_general<PrecValueType><<<m_numPartitions, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder);
1530 device::var::bandLU_critical_div_general<PrecValueType><<<m_numPartitions, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder);
1532 dim3 tmpGrids(blockY, m_numPartitions);
1533 device::var::bandLU_critical_sub_general<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, p_ks, p_BOffsets, partSize, remainder, last);
1536 }
else if (tmp_k > 27){
1537 if (m_safeFactorization)
1538 device::var::bandLU_g32_safe<PrecValueType><<<m_numPartitions, 512>>>(dB, p_ks, p_BOffsets, partSize, remainder);
1540 device::var::bandLU_g32<PrecValueType><<<m_numPartitions, 512>>>(dB, p_ks, p_BOffsets, partSize, remainder);
1542 if (m_safeFactorization)
1543 device::var::bandLU_safe<PrecValueType><<<m_numPartitions, tmp_k * tmp_k >>>(dB, p_ks, p_BOffsets, partSize, remainder);
1545 device::var::bandLU<PrecValueType><<<m_numPartitions, tmp_k * tmp_k>>>(dB, p_ks, p_BOffsets, partSize, remainder);
1549 if (m_safeFactorization)
1550 device::var::boostLastPivot<PrecValueType><<<m_numPartitions, 1>>>(dB, partSize, p_ks, p_BOffsets, partSize, remainder);
1555 if (!m_safeFactorization) {
1556 for (
int i = 0; i < m_numPartitions; i++) {
1557 if (hasZeroPivots(m_B.begin() + m_BOffsets_host[i], m_B.begin() + m_BOffsets_host[i+1], m_ks_host[i], (PrecValueType)
BURST_VALUE))
1563 int gridX = partSize+1;
1566 dim3 grids(gridX, gridY);
1568 for (
int i=0; i<m_numPartitions ; i++) {
1569 if (i < remainder) {
1570 if (m_ks_host[i] <= 1024)
1571 device::var::bandLU_post_divide_per_partition<PrecValueType><<<grids, m_ks_host[i]>>>(dB, m_ks_host[i], m_BOffsets_host[i], partSize + 1);
1573 device::var::bandLU_post_divide_per_partition_general<PrecValueType><<<grids, 512>>>(dB, m_ks_host[i], m_BOffsets_host[i], partSize + 1);
1576 if (m_ks_host[i] <= 1024)
1577 device::var::bandLU_post_divide_per_partition<PrecValueType><<<grids, m_ks_host[i]>>>(dB, m_ks_host[i], m_BOffsets_host[i], partSize);
1579 device::var::bandLU_post_divide_per_partition_general<PrecValueType><<<grids, 512>>>(dB, m_ks_host[i], m_BOffsets_host[i], partSize);
1589 template <
typename PrecVector>
1591 Precond<PrecVector>::partBandedUL(PrecVector& B)
1600 int partSize = m_n / m_numPartitions;
1601 int remainder = m_n % m_numPartitions;
1602 int n_first = (remainder == 0 ? partSize : (partSize + 1));
1604 PrecValueType* dB = thrust::raw_pointer_cast(&B[(2 * m_k + 1) * n_first]);
1606 int n_eff = m_n - n_first;
1607 int numPart_eff = m_numPartitions - 1;
1609 partSize = n_eff / numPart_eff;
1610 remainder = n_eff % numPart_eff;
1613 int n_final = partSize + 1;
1615 for (
int st_row = n_final - 1; st_row > 0; st_row--) {
1616 if (st_row == n_final - 1) {
1617 if (remainder == 0)
continue;
1620 threadsNum = st_row;
1621 dim3 tmpGrids(threadsNum, remainder);
1622 if (threadsNum > 1024) {
1623 if (m_safeFactorization)
1624 device::bandUL_critical_div_safe_general<PrecValueType><<<remainder, 512>>>(dB, st_row, m_k, partSize, remainder);
1626 device::bandUL_critical_div_general<PrecValueType><<<remainder, 512>>>(dB, st_row, m_k, partSize, remainder);
1627 device::bandUL_critical_sub_general<PrecValueType><<<tmpGrids, 512>>>(dB, st_row, m_k, partSize, remainder);
1629 if (m_safeFactorization)
1630 device::bandUL_critical_div_safe<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1632 device::bandUL_critical_div<PrecValueType><<<remainder, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1633 device::bandUL_critical_sub<PrecValueType><<<tmpGrids, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1638 threadsNum = st_row;
1639 dim3 tmpGrids(threadsNum, numPart_eff);
1640 if(threadsNum > 1024) {
1641 if (m_safeFactorization)
1642 device::bandUL_critical_div_safe_general<PrecValueType> <<<numPart_eff, 512>>>(dB, st_row, m_k, partSize, remainder);
1644 device::bandUL_critical_div_general<PrecValueType> <<<numPart_eff, 512>>>(dB, st_row, m_k, partSize, remainder);
1645 device::bandUL_critical_sub_general<PrecValueType> <<<tmpGrids, 512>>>(dB, st_row, m_k, partSize, remainder);
1647 if (m_safeFactorization)
1648 device::bandUL_critical_div_safe<PrecValueType> <<<numPart_eff, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1650 device::bandUL_critical_div<PrecValueType> <<<numPart_eff, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1651 device::bandUL_critical_sub<PrecValueType> <<<tmpGrids, threadsNum>>>(dB, st_row, m_k, partSize, remainder);
1655 }
else if (m_k > 27) {
1656 if (m_safeFactorization)
1657 device::bandUL_g32_safe<PrecValueType><<<numPart_eff, 512>>>(dB, m_k, partSize, remainder);
1659 device::bandUL_g32<PrecValueType><<<numPart_eff, 512>>>(dB, m_k, partSize, remainder);
1661 if (m_safeFactorization)
1662 device::bandUL_safe<PrecValueType><<<numPart_eff, m_k * m_k>>>(dB, m_k, partSize, remainder);
1664 device::bandUL<PrecValueType><<<numPart_eff, m_k * m_k>>>(dB, m_k, partSize, remainder);
1671 if (!m_safeFactorization && hasZeroPivots(B.begin() + (2 * m_k + 1) * n_first, B.end(), m_k, (PrecValueType)
BURST_VALUE))
1682 template <
typename PrecVector>
1684 Precond<PrecVector>::partBandedFwdSweep(PrecVector& v)
1686 if (!m_variableBandwidth)
1687 partBandedFwdSweep_const(v);
1689 partBandedFwdSweep_var(v);
1692 template <
typename PrecVector>
1694 Precond<PrecVector>::partBandedFwdSweep_const(PrecVector& v)
1696 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1697 PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1699 int partSize = m_n / m_numPartitions;
1700 int remainder = m_n % m_numPartitions;
1702 if (m_precondType ==
Block || m_factMethod ==
LU_only || m_numPartitions == 1) {
1704 device::forwardElimL_general<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1706 device::forwardElimL_g32<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1708 device::forwardElimL<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1711 device::forwardElimL_LU_UL_general<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1713 device::forwardElimL_LU_UL_g32<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1715 device::forwardElimL_LU_UL<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1719 template <
typename PrecVector>
1721 Precond<PrecVector>::partBandedFwdSweep_var(PrecVector& v)
1723 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1724 PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1725 int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
1726 int* p_BOffsets = thrust::raw_pointer_cast(&m_BOffsets[0]);
1728 int tmp_k = cusp::blas::nrmmax(m_ks);
1729 int partSize = m_n / m_numPartitions;
1730 int remainder = m_n % m_numPartitions;
1733 device::var::fwdElim_sol<PrecValueType><<<m_numPartitions, 512>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1734 else if (tmp_k > 32)
1735 device::var::fwdElim_sol_medium<PrecValueType><<<m_numPartitions, tmp_k>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1737 device::var::fwdElim_sol_narrow<PrecValueType><<<m_numPartitions, tmp_k>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1746 template <
typename PrecVector>
1748 Precond<PrecVector>::partBandedBckSweep(PrecVector& v)
1750 if (!m_variableBandwidth)
1751 partBandedBckSweep_const(v);
1753 partBandedBckSweep_var(v);
1756 template <
typename PrecVector>
1758 Precond<PrecVector>::partBandedBckSweep_const(PrecVector& v)
1760 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1761 PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1763 int partSize = m_n / m_numPartitions;
1764 int remainder = m_n % m_numPartitions;
1766 if (m_precondType ==
Block || m_factMethod ==
LU_only || m_numPartitions == 1) {
1767 if (m_numPartitions > 1) {
1769 device::backwardElimU_general<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1771 device::backwardElimU_g32<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1773 device::backwardElimU<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1781 dim3 grids(gridX, m_numPartitions);
1783 device::preBck_sol_divide<PrecValueType><<<grids, blockX>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1786 device::bckElim_sol<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1788 device::bckElim_sol_medium<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1790 device::bckElim_sol_narrow<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1794 device::backwardElimU_LU_UL_general<PrecValueType><<<m_numPartitions, 512>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1796 device::backwardElimU_LU_UL_g32<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1798 device::backwardElimU_LU_UL<PrecValueType><<<m_numPartitions, m_k>>>(m_n, m_k, p_B, p_v, partSize, remainder);
1802 template <
typename PrecVector>
1804 Precond<PrecVector>::partBandedBckSweep_var(PrecVector& v)
1806 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1807 PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1808 int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
1809 int* p_BOffsets = thrust::raw_pointer_cast(&m_BOffsets[0]);
1811 int tmp_k = cusp::blas::nrmmax(m_ks);
1812 int partSize = m_n / m_numPartitions;
1813 int remainder = m_n % m_numPartitions;
1815 int gridX = 1, blockX = partSize + 1;
1817 dim3 grids(gridX, m_numPartitions);
1818 device::var::preBck_sol_divide<PrecValueType><<<grids, blockX>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1821 device::var::bckElim_sol<PrecValueType><<<m_numPartitions, 512>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1822 else if (tmp_k > 32)
1823 device::var::bckElim_sol_medium<PrecValueType><<<m_numPartitions, tmp_k>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1825 device::var::bckElim_sol_narrow<PrecValueType><<<m_numPartitions, tmp_k>>>(m_n, p_ks, p_BOffsets, p_B, p_v, partSize, remainder);
1832 template <
typename PrecVector>
1834 Precond<PrecVector>::partFullFwdSweep(PrecVector& v)
1836 PrecValueType* p_R = thrust::raw_pointer_cast(&m_R[0]);
1837 PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1839 int partSize = m_n / m_numPartitions;
1840 int remainder = m_n % m_numPartitions;
1842 dim3 grids(m_numPartitions-1, 1);
1844 if (!m_variableBandwidth) {
1846 device::forwardElimLNormal_g512<PrecValueType><<<grids, 512>>>(m_n, m_k, 2*m_k, p_R, p_v, partSize, remainder);
1848 device::forwardElimLNormal<PrecValueType><<<grids, 2*m_k-1>>>(m_n, m_k, 2*m_k, p_R, p_v, partSize, remainder);
1850 int* p_ROffsets = thrust::raw_pointer_cast(&m_ROffsets[0]);
1851 int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
1854 device::var::fwdElim_full<PrecValueType><<<grids, 512>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1856 device::var::fwdElim_full_narrow<PrecValueType><<<grids, m_k>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1865 template <
typename PrecVector>
1867 Precond<PrecVector>::partFullBckSweep(PrecVector& v)
1869 PrecValueType* p_R = thrust::raw_pointer_cast(&m_R[0]);
1870 PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1872 int partSize = m_n / m_numPartitions;
1873 int remainder = m_n % m_numPartitions;
1875 dim3 grids(m_numPartitions-1, 1);
1877 if (!m_variableBandwidth) {
1879 device::backwardElimUNormal_g512<PrecValueType><<<grids, 512>>>(m_n, m_k, 2*m_k, p_R, p_v, partSize, remainder);
1881 device::backwardElimUNormal<PrecValueType><<<grids, 2*m_k-1>>>(m_n, m_k, 2*m_k, p_R, p_v, partSize, remainder);
1883 int* p_ROffsets = thrust::raw_pointer_cast(&m_ROffsets[0]);
1884 int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
1887 device::var::preBck_full_divide<PrecValueType><<<m_numPartitions-1, 512>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1888 device::var::bckElim_full<PrecValueType><<<grids, 512>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1891 device::var::preBck_full_divide_narrow<PrecValueType><<<m_numPartitions-1, m_k>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1892 device::var::bckElim_full_narrow<PrecValueType><<<grids, 2*m_k-1>>>(m_n, p_spike_ks, p_ROffsets, p_R, p_v, partSize, remainder);
1902 template <
typename PrecVector>
1904 Precond<PrecVector>::purifyRHS(PrecVector& v,
1907 PrecValueType* p_offDiags = thrust::raw_pointer_cast(&m_offDiags[0]);
1908 PrecValueType* p_v = thrust::raw_pointer_cast(&v[0]);
1909 PrecValueType* p_res = thrust::raw_pointer_cast(&res[0]);
1911 int partSize = m_n / m_numPartitions;
1912 int remainder = m_n % m_numPartitions;
1914 dim3 grids(m_k, m_numPartitions-1);
1916 if (!m_variableBandwidth) {
1918 device::innerProductBCX_g256<PrecValueType><<<grids, 256>>>(p_offDiags, p_v, p_res, m_n, m_k, partSize, m_numPartitions, remainder);
1920 device::innerProductBCX_g64<PrecValueType><<<grids, 256>>>(p_offDiags, p_v, p_res, m_n, m_k, partSize, m_numPartitions, remainder);
1922 device::innerProductBCX_g32<PrecValueType><<<grids, 64>>>(p_offDiags, p_v, p_res, m_n, m_k, partSize, m_numPartitions, remainder);
1924 device::innerProductBCX<PrecValueType><<<grids, 32>>>(p_offDiags, p_v, p_res, m_n, m_k, partSize, m_numPartitions, remainder);
1926 int* p_WVOffsets = thrust::raw_pointer_cast(&m_WVOffsets[0]);
1927 int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
1930 device::innerProductBCX_var_bandwidth_g256<PrecValueType><<<grids, 256>>>(p_offDiags, p_v, p_res, m_n, p_spike_ks, p_WVOffsets, partSize, m_numPartitions, remainder);
1932 device::innerProductBCX_var_bandwidth_g64<PrecValueType><<<grids, 256>>>(p_offDiags, p_v, p_res, m_n, p_spike_ks, p_WVOffsets, partSize, m_numPartitions, remainder);
1934 device::innerProductBCX_var_bandwidth_g32<PrecValueType><<<grids, 64>>>(p_offDiags, p_v, p_res, m_n, p_spike_ks, p_WVOffsets, partSize, m_numPartitions, remainder);
1936 device::innerProductBCX_var_bandwidth<PrecValueType><<<grids, 32>>>(p_offDiags, p_v, p_res, m_n, p_spike_ks, p_WVOffsets, partSize, m_numPartitions, remainder);
1945 template <
typename PrecVector>
1947 Precond<PrecVector>::calculateSpikes(PrecVector& WV)
1949 if (!m_variableBandwidth) {
1950 calculateSpikes_const(WV);
1954 int totalRHSCount = cusp::blas::nrm1(m_offDiagWidths_right_host) + cusp::blas::nrm1(m_offDiagWidths_left_host);
1955 if (totalRHSCount >= 2800) {
1956 calculateSpikes_var(WV);
1960 calculateSpikes_var_old(WV);
1963 template <
typename PrecVector>
1965 Precond<PrecVector>::calculateSpikes_var_old(PrecVector& WV)
1967 PrecVector WV_spare(m_k*m_k);
1969 PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
1970 PrecValueType* p_WV_spare = thrust::raw_pointer_cast(&WV_spare[0]);
1973 int last_partition_size = m_n / m_numPartitions;
1974 int first_partition_size = last_partition_size;
1975 int numThreadsToUse = adjustNumThreads(cusp::blas::nrm1(m_ks) / m_numPartitions);
1977 if (m_n % m_numPartitions != 0)
1978 first_partition_size++;
1986 int n_eff = m_n - last_partition_size;
1987 int numPart_eff = m_numPartitions - 1;
1988 int partSize = n_eff / numPart_eff;
1989 int remainder = n_eff % numPart_eff;
1991 const int BUF_FACTOR = 16;
1993 PrecVector extV(m_k * n_eff, (PrecValueType)0), buffer;
1995 PrecValueType* p_extV = thrust::raw_pointer_cast(&extV[0]);
1996 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
1997 int* p_secondReordering = thrust::raw_pointer_cast(&m_secondReordering[0]);
1998 int* p_secondPerm = thrust::raw_pointer_cast(&m_secondPerm[0]);
2000 dim3 gridsCopy(m_k, numPart_eff);
2001 dim3 gridsSweep(numPart_eff, m_k);
2003 int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
2004 int* p_offDiagWidths_right = thrust::raw_pointer_cast(&m_offDiagWidths_right[0]);
2005 int* p_offDiagPerms_right = thrust::raw_pointer_cast(&m_offDiagPerms_right[0]);
2006 int* p_first_rows = thrust::raw_pointer_cast(&m_first_rows[0]);
2007 int* p_offsets = thrust::raw_pointer_cast(&m_BOffsets[0]);
2009 int permuteBlockX = n_eff;
2010 int permuteGridX = 1;
2012 dim3 gridsPermute(permuteGridX, m_k);
2015 device::copyWVFromOrToExtendedV_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV,
false);
2016 buffer.resize((m_k - (BUF_FACTOR-1) * (m_k / BUF_FACTOR)) * n_eff);
2018 PrecValueType* p_buffer = thrust::raw_pointer_cast(&buffer[0]);
2020 for (
int i=0; i<BUF_FACTOR; i++) {
2021 gridsPermute.y = m_k / BUF_FACTOR;
2022 if (i == BUF_FACTOR - 1)
2023 gridsPermute.y = m_k - (BUF_FACTOR-1) * (m_k/BUF_FACTOR);
2024 device::permute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, p_extV+(i*(m_k/BUF_FACTOR)*n_eff), p_buffer, p_secondPerm);
2025 thrust::copy(buffer.begin(), buffer.begin()+(gridsPermute.y * n_eff), extV.begin()+i*(m_k/BUF_FACTOR)*n_eff);
2029 int last_row = 0, pseudo_first_row = 0;
2030 for (
int i=0; i<numPart_eff; i++) {
2032 last_row += partSize + 1;
2034 last_row += partSize;
2036 int tmp_first_row = m_first_rows_host[i];
2037 device::var::fwdElim_rightSpike_per_partition<PrecValueType><<<m_offDiagWidths_right_host[i], numThreadsToUse>>> (n_eff, m_ks_host[i], m_BOffsets_host[i]+m_ks_host[i]+(2*m_ks_host[i]+1)*(m_first_rows_host[i]-pseudo_first_row), p_B, p_extV, m_first_rows_host[i], last_row);
2039 int blockX = last_row - m_first_rows_host[i];
2042 dim3 grids(gridX, m_offDiagWidths_right_host[i]);
2043 device::var::preBck_rightSpike_divide_per_partition<PrecValueType><<<grids, blockX>>> (n_eff, m_ks_host[i], m_BOffsets_host[i]+m_ks_host[i]+(2*m_ks_host[i]+1)*(m_first_rows_host[i]-pseudo_first_row), p_B, p_extV, m_first_rows_host[i], last_row);
2045 m_first_rows_host[i] = thrust::reduce(m_secondPerm.begin()+(last_row-m_k), m_secondPerm.begin()+last_row, last_row, thrust::minimum<int>());
2046 device::var::bckElim_rightSpike_per_partition<PrecValueType><<<m_offDiagWidths_right_host[i], numThreadsToUse>>> (n_eff, m_ks_host[i], m_BOffsets_host[i]+m_ks_host[i]+(2*m_ks_host[i]+1)*(last_row-pseudo_first_row-1), p_B, p_extV, m_first_rows_host[i], last_row);
2048 pseudo_first_row = last_row;
2052 for (
int i=0; i<BUF_FACTOR; i++) {
2053 gridsPermute.y = m_k / BUF_FACTOR;
2054 if (i == BUF_FACTOR - 1)
2055 gridsPermute.y = m_k - (BUF_FACTOR-1) * (m_k/BUF_FACTOR);
2056 device::permute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, p_extV+(i*(m_k/BUF_FACTOR)*n_eff), p_buffer, p_secondReordering);
2057 thrust::copy(buffer.begin(), buffer.begin()+(gridsPermute.y * n_eff), extV.begin()+i*(m_k/BUF_FACTOR)*n_eff);
2060 device::copyWVFromOrToExtendedV_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV,
true);
2062 for (
int i=0; i<numPart_eff; i++) {
2063 cusp::blas::fill(WV_spare, (PrecValueType) 0);
2064 device::matrixVReordering_perPartition<PrecValueType><<<m_offDiagWidths_right_host[i], numThreadsToUse>>>(m_k, p_WV+2*i*m_k*m_k, p_WV_spare, p_offDiagPerms_right+i*m_k);
2065 thrust::copy(WV_spare.begin(), WV_spare.end(), WV.begin() + (2*i*m_k*m_k));
2075 int n_eff = m_n - first_partition_size;
2076 int numPart_eff = m_numPartitions - 1;
2077 int partSize = n_eff / numPart_eff;
2078 int remainder = n_eff % numPart_eff;
2080 const int BUF_FACTOR = 16;
2082 PrecVector extW(m_k * n_eff, (PrecValueType)0), buffer;
2084 PrecValueType* p_extW = thrust::raw_pointer_cast(&extW[0]);
2085 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
2087 dim3 gridsSweep(numPart_eff, m_k);
2088 dim3 gridsCopy(m_k, numPart_eff);
2090 int* p_ks = thrust::raw_pointer_cast(&m_ks[1]);
2091 int* p_offDiagWidths_left = thrust::raw_pointer_cast(&m_offDiagWidths_left[0]);
2092 int* p_offDiagPerms_left = thrust::raw_pointer_cast(&m_offDiagPerms_left[0]);
2094 IntVector tmp_offsets;
2095 IntVector tmp_secondReordering(m_n, first_partition_size);
2096 IntVector tmp_secondPerm(m_n, first_partition_size);
2098 cusp::blas::axpby(m_secondReordering, tmp_secondReordering, tmp_secondReordering, 1.0, -1.0);
2099 cusp::blas::axpby(m_secondPerm, tmp_secondPerm, tmp_secondPerm, 1.0, -1.0);
2101 int* p_secondReordering = thrust::raw_pointer_cast(&tmp_secondReordering[first_partition_size]);
2102 int* p_secondPerm = thrust::raw_pointer_cast(&tmp_secondPerm[first_partition_size]);
2106 for (
int i = m_numPartitions-1; i >= 1; i--)
2107 tmp_offsets_host[i] -= tmp_offsets_host[1];
2108 tmp_offsets = tmp_offsets_host;
2111 int* p_offsets = thrust::raw_pointer_cast(&tmp_offsets[1]);
2113 int permuteBlockX = n_eff;
2114 int permuteGridX = 1;
2116 dim3 gridsPermute(permuteGridX, m_k);
2119 device::copyWVFromOrToExtendedW_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW,
false);
2120 buffer.resize((m_k - (BUF_FACTOR-1) * (m_k / BUF_FACTOR)) * n_eff);
2121 PrecValueType* p_buffer = thrust::raw_pointer_cast(&buffer[0]);
2123 for (
int i = 0; i < BUF_FACTOR; i++) {
2124 gridsPermute.y = m_k / BUF_FACTOR;
2125 if (i == BUF_FACTOR - 1)
2126 gridsPermute.y = m_k - (BUF_FACTOR-1) * (m_k/BUF_FACTOR);
2127 device::permute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, p_extW+i*(m_k/BUF_FACTOR)*n_eff, p_buffer, p_secondPerm);
2128 thrust::copy(buffer.begin(), buffer.begin()+(gridsPermute.y * n_eff), extW.begin()+i*(m_k/BUF_FACTOR)*n_eff);
2134 for (
int i = 0; i < numPart_eff; i++) {
2136 last_row += partSize + 1;
2138 last_row += partSize;
2139 device::var::fwdElim_leftSpike_per_partition<PrecValueType><<<m_offDiagWidths_left_host[i], numThreadsToUse>>> (n_eff, m_ks_host[i+1], m_k - m_offDiagWidths_left_host[i], m_BOffsets_host[i+1]+m_ks_host[i+1], p_B, p_extW, first_row, last_row);
2141 int blockX = last_row - first_row;
2144 dim3 grids(gridX, m_offDiagWidths_left_host[i]);
2146 device::var::preBck_leftSpike_divide_per_partition<PrecValueType><<<grids, blockX>>> (n_eff, m_ks_host[i+1], m_k - m_offDiagWidths_left_host[i], m_BOffsets_host[i+1]+m_ks_host[i+1], p_B, p_extW, first_row, last_row);
2147 device::var::bckElim_leftSpike_per_partition<PrecValueType><<<m_offDiagWidths_left_host[i], numThreadsToUse>>>(n_eff, m_ks_host[i+1], m_k - m_offDiagWidths_left_host[i], m_BOffsets_host[i+1] + m_ks_host[i+1] + (2*m_ks_host[i+1]+1)*(last_row-first_row-1), p_B, p_extW, first_row, last_row);
2148 first_row = last_row;
2152 for (
int i = 0; i < BUF_FACTOR; i++) {
2153 gridsPermute.y = m_k / BUF_FACTOR;
2154 if (i == BUF_FACTOR - 1)
2155 gridsPermute.y = m_k - (BUF_FACTOR-1) * (m_k/BUF_FACTOR);
2157 device::permute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, p_extW+i*(m_k/BUF_FACTOR)*n_eff, p_buffer, p_secondReordering);
2158 thrust::copy(buffer.begin(), buffer.begin()+(gridsPermute.y * n_eff), extW.begin()+i*(m_k/BUF_FACTOR)*n_eff);
2161 device::copyWVFromOrToExtendedW_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW,
true);
2164 for (
int i = 0; i < numPart_eff; i++) {
2165 cusp::blas::fill(WV_spare, (PrecValueType) 0);
2166 device::matrixWReordering_perPartition<PrecValueType><<<m_offDiagWidths_left_host[i], numThreadsToUse>>>(m_k, p_WV+(2*i+1)*m_k*m_k, p_WV_spare, p_offDiagPerms_left+i*m_k);
2167 thrust::copy(WV_spare.begin(), WV_spare.end(), WV.begin() + ((2*i+1)*m_k*m_k));
2172 template <
typename PrecVector>
2174 Precond<PrecVector>::calculateSpikes_const(PrecVector& WV)
2176 PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
2180 int last_partition_size = m_n / m_numPartitions;
2181 int first_partition_size = last_partition_size;
2183 if (m_n % m_numPartitions != 0)
2184 first_partition_size++;
2192 int n_eff = m_n - last_partition_size;
2193 int numPart_eff = m_numPartitions - 1;
2194 int partSize = n_eff / numPart_eff;
2195 int remainder = n_eff % numPart_eff;
2197 PrecVector extV(m_k * n_eff, (PrecValueType) 0);
2199 PrecValueType* p_extV = thrust::raw_pointer_cast(&extV[0]);
2200 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
2202 dim3 gridsCopy(m_k, numPart_eff);
2203 dim3 gridsSweep(numPart_eff, m_k);
2206 device::copyWVFromOrToExtendedV_general<PrecValueType><<<gridsCopy, 512>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV,
false);
2207 device::forwardElimL_bottom_general<PrecValueType><<<gridsSweep, 512>>>(n_eff, m_k, m_k, p_B, p_extV, partSize, remainder);
2208 device::backwardElimU_bottom_general<PrecValueType><<<gridsSweep, 512>>>(n_eff, m_k, 2*m_k, p_B, p_extV, partSize, remainder);
2209 device::copyWVFromOrToExtendedV_general<PrecValueType><<<gridsCopy, 512>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV,
true);
2210 }
else if (m_k > 32) {
2211 device::copyWVFromOrToExtendedV<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV,
false);
2212 device::forwardElimL_bottom_g32<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, m_k, p_B, p_extV, partSize, remainder);
2213 device::backwardElimU_bottom_g32<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, 2*m_k, p_B, p_extV, partSize, remainder);
2214 device::copyWVFromOrToExtendedV<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV,
true);
2216 device::copyWVFromOrToExtendedV<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV,
false);
2217 device::forwardElimL_bottom<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, m_k, p_B, p_extV, partSize, remainder);
2218 device::backwardElimU_bottom<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, 2*m_k, p_B, p_extV, partSize, remainder);
2219 device::copyWVFromOrToExtendedV<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extV,
true);
2229 int n_eff = m_n - first_partition_size;
2230 int numPart_eff = m_numPartitions - 1;
2231 int partSize = n_eff / numPart_eff;
2232 int remainder = n_eff % numPart_eff;
2234 PrecVector extW(m_k * n_eff, (PrecValueType) 0);
2236 PrecValueType* p_extW = thrust::raw_pointer_cast(&extW[0]);
2237 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[(2*m_k+1)*first_partition_size]);
2239 dim3 gridsSweep(numPart_eff, m_k);
2240 dim3 gridsCopy(m_k, numPart_eff);
2243 device::copyWVFromOrToExtendedW_general<PrecValueType><<<gridsCopy, 512>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW,
false);
2244 device::forwardElimL_general<PrecValueType><<<gridsSweep, 512>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2245 device::backwardElimU_general<PrecValueType><<<gridsSweep, 512>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2246 device::copyWVFromOrToExtendedW_general<PrecValueType><<<gridsCopy, 512>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW,
true);
2247 }
else if (m_k > 32) {
2248 device::copyWVFromOrToExtendedW<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW,
false);
2249 device::forwardElimL_g32<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2250 device::backwardElimU_g32<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2251 device::copyWVFromOrToExtendedW<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW,
true);
2253 device::copyWVFromOrToExtendedW<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW,
false);
2254 device::forwardElimL<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2255 device::backwardElimU<PrecValueType><<<gridsSweep, m_k>>>(n_eff, m_k, p_B, p_extW, partSize, remainder);
2256 device::copyWVFromOrToExtendedW<PrecValueType><<<gridsCopy, m_k>>>(n_eff, m_k, partSize, remainder, p_WV, p_extW,
true);
2261 template <
typename PrecVector>
2263 Precond<PrecVector>::calculateSpikes_var(PrecVector& WV)
2265 PrecVector WV_spare(m_k*m_k);
2267 PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
2268 PrecValueType* p_WV_spare = thrust::raw_pointer_cast(&WV_spare[0]);
2271 int numThreadsToUse = adjustNumThreads(cusp::blas::nrm1(m_ks) / m_numPartitions);
2273 const int SWEEP_MAX_NUM_THREADS = 128;
2280 int numPart_eff = m_numPartitions;
2281 int partSize = n_eff / numPart_eff;
2282 int remainder = n_eff % numPart_eff;
2283 int rightOffDiagWidth = cusp::blas::nrmmax(m_offDiagWidths_right);
2284 int leftOffDiagWidth = cusp::blas::nrmmax(m_offDiagWidths_left);
2286 PrecVector extWV((leftOffDiagWidth + rightOffDiagWidth) * n_eff, (PrecValueType) 0);
2289 PrecValueType* p_extWV = thrust::raw_pointer_cast(&extWV[0]);
2290 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
2291 int* p_secondReordering = thrust::raw_pointer_cast(&m_secondReordering[0]);
2292 int* p_secondPerm = thrust::raw_pointer_cast(&m_secondPerm[0]);
2293 int* p_ks = thrust::raw_pointer_cast(&m_ks[0]);
2294 int* p_offDiagWidths_right = thrust::raw_pointer_cast(&m_offDiagWidths_right[0]);
2295 int* p_offDiagPerms_right = thrust::raw_pointer_cast(&m_offDiagPerms_right[0]);
2296 int* p_offDiagWidths_left = thrust::raw_pointer_cast(&m_offDiagWidths_left[0]);
2297 int* p_offDiagPerms_left = thrust::raw_pointer_cast(&m_offDiagPerms_left[0]);
2298 int* p_first_rows = thrust::raw_pointer_cast(&m_first_rows[0]);
2299 int* p_offsets = thrust::raw_pointer_cast(&m_BOffsets[0]);
2301 int permuteBlockX = leftOffDiagWidth+rightOffDiagWidth;
2302 int permuteGridX = 1;
2303 int permuteGridY = n_eff;
2304 int permuteGridZ = 1;
2307 dim3 gridsPermute(permuteGridX, permuteGridY, permuteGridZ);
2309 buffer.resize((leftOffDiagWidth + rightOffDiagWidth) * n_eff);
2311 PrecValueType* p_buffer = thrust::raw_pointer_cast(&buffer[0]);
2313 dim3 gridsCopy((leftOffDiagWidth + rightOffDiagWidth), numPart_eff);
2315 device::copyWVFromOrToExtendedWVTranspose_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(leftOffDiagWidth + rightOffDiagWidth, m_k, rightOffDiagWidth, partSize, remainder, m_k-rightOffDiagWidth-leftOffDiagWidth, p_WV, p_extWV,
false);
2316 device::columnPermute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, leftOffDiagWidth+rightOffDiagWidth, p_extWV, p_buffer, p_secondPerm);
2319 int sweepBlockX = leftOffDiagWidth;
2321 if (sweepBlockX < rightOffDiagWidth)
2322 sweepBlockX = rightOffDiagWidth;
2324 dim3 sweepGrids(sweepGridX, 2*numPart_eff-2);
2326 device::var::fwdElim_spike<PrecValueType><<<sweepGrids, sweepBlockX>>>(n_eff, p_ks, leftOffDiagWidth + rightOffDiagWidth, rightOffDiagWidth, p_offsets, p_B, p_buffer, partSize, remainder, p_offDiagWidths_left, p_offDiagWidths_right, p_first_rows);
2328 int preBckBlockX = leftOffDiagWidth + rightOffDiagWidth;
2329 int preBckGridX = 1;
2330 int preBckGridY = n_eff;
2331 int preBckGridZ = 1;
2334 dim3 preBckGrids(preBckGridX, preBckGridY, preBckGridZ);
2336 device::var::preBck_offDiag_divide<PrecValueType><<<preBckGrids, preBckBlockX>>>(n_eff, leftOffDiagWidth + rightOffDiagWidth, p_ks, p_offsets, p_B, p_buffer, partSize, remainder);
2340 for (
int i = 0; i < m_numPartitions - 1; i++) {
2342 last_row += (partSize + 1);
2344 last_row += partSize;
2346 m_first_rows_host[i] = thrust::reduce(m_secondPerm.begin()+(last_row-m_k), m_secondPerm.begin()+last_row, last_row, thrust::minimum<int>());
2348 m_first_rows = m_first_rows_host;
2349 p_first_rows = thrust::raw_pointer_cast(&m_first_rows[0]);
2352 device::var::bckElim_spike<PrecValueType><<<sweepGrids, sweepBlockX>>>(n_eff, p_ks, leftOffDiagWidth + rightOffDiagWidth, rightOffDiagWidth, p_offsets, p_B, p_buffer, partSize, remainder, p_offDiagWidths_left, p_offDiagWidths_right, p_first_rows);
2356 device::columnPermute<PrecValueType><<<gridsPermute, permuteBlockX>>>(n_eff, leftOffDiagWidth + rightOffDiagWidth, p_buffer, p_extWV, p_secondReordering);
2357 device::copyWVFromOrToExtendedWVTranspose_general<PrecValueType><<<gridsCopy, numThreadsToUse>>>(leftOffDiagWidth + rightOffDiagWidth, m_k, rightOffDiagWidth, partSize, remainder, m_k-rightOffDiagWidth-leftOffDiagWidth, p_WV, p_extWV,
true);
2359 for (
int i = 0; i < numPart_eff - 1; i++) {
2360 cusp::blas::fill(WV_spare, (PrecValueType) 0);
2361 device::matrixVReordering_perPartition<PrecValueType><<<m_offDiagWidths_right_host[i], numThreadsToUse>>>(m_k, p_WV+2*i*m_k*m_k, p_WV_spare, p_offDiagPerms_right+i*m_k);
2362 thrust::copy(WV_spare.begin(), WV_spare.end(), WV.begin()+(2*i*m_k*m_k));
2364 cusp::blas::fill(WV_spare, (PrecValueType) 0);
2365 device::matrixWReordering_perPartition<PrecValueType><<<m_offDiagWidths_left_host[i], numThreadsToUse>>>(m_k, p_WV+(2*i+1)*m_k*m_k, p_WV_spare, p_offDiagPerms_left+i*m_k);
2366 thrust::copy(WV_spare.begin(), WV_spare.end(), WV.begin()+((2*i+1)*m_k*m_k));
2375 template <
typename PrecVector>
2377 Precond<PrecVector>::adjustNumThreads(
int inNumThreads) {
2381 for (
int i = 0; i < 16; i++) {
2383 if (inNumThreads > cur) {
2387 if (inNumThreads - prev > cur - inNumThreads || prev == 0)
2397 template <
typename PrecVector>
2399 Precond<PrecVector>::calculateSpikes(PrecVector& B2,
2402 int two_k = 2 * m_k;
2403 int partSize = m_n / m_numPartitions;
2404 int remainder = m_n % m_numPartitions;
2407 PrecVector compB2((two_k+1)*two_k*(m_numPartitions-1));
2408 cusp::blas::fill(compB2, (PrecValueType) 0);
2410 PrecValueType* p_B2 = thrust::raw_pointer_cast(&B2[0]);
2411 PrecValueType* p_compB2 = thrust::raw_pointer_cast(&compB2[0]);
2413 dim3 gridsCompress(two_k, m_numPartitions-1);
2416 device::copydAtodA2_general<PrecValueType><<<gridsCompress, 1024>>>(m_n, m_k, p_B2, p_compB2, two_k, partSize, m_numPartitions, remainder);
2418 device::copydAtodA2<PrecValueType><<<gridsCompress, two_k+1>>>(m_n, m_k, p_B2, p_compB2, two_k, partSize, m_numPartitions, remainder);
2421 PrecVector partialB(2*(two_k+1)*(m_k+1)*(m_numPartitions-1));
2423 PrecValueType* p_B = thrust::raw_pointer_cast(&m_B[0]);
2424 PrecValueType* p_partialB = thrust::raw_pointer_cast(&partialB[0]);
2426 dim3 gridsCopy(m_k+1, 2*(m_numPartitions-1));
2429 device::copydAtoPartialA_general<PrecValueType><<<gridsCopy, 1024>>>(m_n, m_k, p_B, p_compB2, p_partialB, partSize, m_numPartitions, remainder, two_k);
2431 device::copydAtoPartialA<PrecValueType><<<gridsCopy, two_k+1>>>(m_n, m_k, p_B, p_compB2, p_partialB, partSize, m_numPartitions, remainder, two_k);
2434 PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
2436 dim3 gridsSweep(m_numPartitions-1, m_k);
2439 device::forwardElimLdWV_general<PrecValueType><<<gridsSweep, 512>>>(m_k, p_partialB, p_WV, m_k, 0, 0);
2440 device::backwardElimUdWV_general<PrecValueType><<<gridsSweep, 512>>>(m_k, p_partialB, p_WV, m_k, 0, 1);
2441 device::backwardElimUdWV_general<PrecValueType><<<gridsSweep, 512>>>(m_k, p_partialB, p_WV, m_k, 1, 0);
2442 device::forwardElimLdWV_general<PrecValueType><<<gridsSweep, 512>>>(m_k, p_partialB, p_WV, m_k, 1, 1);
2443 }
else if (m_k > 32) {
2444 device::forwardElimLdWV_g32<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 0, 0);
2445 device::backwardElimUdWV_g32<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 0, 1);
2446 device::backwardElimUdWV_g32<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 1, 0);
2447 device::forwardElimLdWV_g32<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 1, 1);
2449 device::forwardElimLdWV<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 0, 0);
2450 device::backwardElimUdWV<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 0, 1);
2451 device::backwardElimUdWV<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 1, 0);
2452 device::forwardElimLdWV<PrecValueType><<<gridsSweep, m_k>>>(m_k, p_partialB, p_WV, m_k, 1, 1);
2459 template <
typename PrecVector>
2463 PrecValueType* p_WV = thrust::raw_pointer_cast(&WV[0]);
2464 PrecValueType* p_R = thrust::raw_pointer_cast(&m_R[0]);
2466 dim3 grids(m_k, m_numPartitions-1);
2468 if (!m_variableBandwidth) {
2470 device::assembleReducedMat_general<PrecValueType><<<grids, 512>>>(m_k, p_WV, p_R);
2472 device::assembleReducedMat_g32<PrecValueType><<<grids, m_k>>>(m_k, p_WV, p_R);
2474 device::assembleReducedMat<PrecValueType><<<m_numPartitions-1, m_k*m_k>>>(m_k, p_WV, p_R);
2476 int* p_WVOffsets = thrust::raw_pointer_cast(&m_WVOffsets[0]);
2477 int* p_ROffsets = thrust::raw_pointer_cast(&m_ROffsets[0]);
2478 int* p_spike_ks = thrust::raw_pointer_cast(&m_spike_ks[0]);
2481 device::assembleReducedMat_var_bandwidth_general<PrecValueType><<<grids, 512>>>(p_spike_ks, p_WVOffsets, p_ROffsets, p_WV, p_R);
2483 device::assembleReducedMat_var_bandwidth_g32<PrecValueType><<<grids, m_k>>>(p_spike_ks, p_WVOffsets, p_ROffsets, p_WV, p_R);
2485 device::assembleReducedMat_var_bandwidth<PrecValueType><<<m_numPartitions-1, m_k*m_k>>>(p_spike_ks, p_WVOffsets, p_ROffsets, p_WV, p_R);
2493 template <
typename PrecVector>
2495 Precond<PrecVector>::copyLastPartition(PrecVector &B2) {
2496 thrust::copy(B2.begin()+(2*m_k+1) * (m_n - m_n / m_numPartitions), B2.end(), m_B.begin()+(2*m_k+1) * (m_n - m_n / m_numPartitions) );
2504 template <
typename PrecVector>
2506 Precond<PrecVector>::hasZeroPivots(
const PrecVectorIterator& start_B,
2507 const PrecVectorIterator& end_B,
2509 PrecValueType threshold)
2512 strided_range<typename PrecVector::iterator> diag(start_B + k, end_B, 2*k + 1);
2519 return thrust::any_of(diag.begin(), diag.end(), SmallerThan<PrecValueType>(threshold));
double gettimeAssembly() const
Definition: precond.h:84
int getNumPartitions() const
Definition: precond.h:92
cusp::array1d< int, MemorySpace > IntVector
Definition: precond.h:44
Precond()
Definition: precond.h:312
double getTimeFullLU() const
Definition: precond.h:85
const unsigned int MAX_GRID_DIMENSION
Definition: common.h:51
cusp::array1d< int, cusp::host_memory > IntVectorH
Definition: precond.h:49
IntVector MatrixMap
Definition: precond.h:45
IntVectorH MatrixMapH
Definition: precond.h:50
double getTimeShuffle() const
Definition: precond.h:86
double getTimeTransfer() const
Definition: precond.h:79
Definition: exception.h:18
double getTimeBandUL() const
Definition: precond.h:83
GPU timer.
Definition: timer.h:35
CPU and GPU timer classes.
void setup(const Matrix &A)
Definition: precond.h:565
FactorizationMethod
Definition: common.h:60
PrecVector::memory_space MemorySpace
Definition: precond.h:40
PreconditionerType
Definition: common.h:65
int getBandwidthMC64() const
Definition: precond.h:89
cusp::array1d< PrecValueType, cusp::host_memory > MatrixMapFH
Definition: precond.h:51
#define BURST_VALUE
Definition: common.h:43
int getBandwidthReordering() const
Definition: precond.h:88
const unsigned int CRITICAL_THRESHOLD
Definition: common.h:53
cusp::array1d< PrecValueType, cusp::host_memory > PrecVectorH
Definition: precond.h:48
double getTimeCopyOffDiags() const
Definition: precond.h:81
~Precond()
Definition: precond.h:73
double getTimeCPUAssemble() const
Definition: precond.h:78
int getBandwidth() const
Definition: precond.h:90
cusp::coo_matrix< int, PrecValueType, cusp::host_memory > PrecMatrixCooH
Definition: precond.h:54
PrecVector::iterator PrecVectorIterator
Definition: precond.h:42
__global__ void permute(int N, T *ori_array, T *final_array, int *per_array)
Definition: shuffle.cuh:12
__host__ __device__ bool operator()(T val)
Definition: precond.h:260
cusp::array1d< PrecValueType, MemorySpace > MatrixMapF
Definition: precond.h:46
Spike preconditioner.
Definition: precond.h:37
Precond & operator=(const Precond &prec)
Definition: precond.h:368
void update(const PrecVector &entries)
This function updates the banded matrix and off-diagonal matrices based on the given entries...
Definition: precond.h:406
cusp::array1d< int, cusp::host_memory > IntVectorH
Definition: bicgstab2.h:20
Definition: precond.h:255
T m_threshold
Definition: precond.h:262
double getTimeReorder() const
Definition: precond.h:77
void solve(PrecVector &v, PrecVector &z)
Definition: precond.h:689
void kernelConfigAdjust(int &numThreads, int &numBlockX, const int numThreadsMax)
Definition: common.h:71
double getActualDropOff() const
Definition: precond.h:93
Definition: precond.h:246
cusp::coo_matrix< int, PrecValueType, MemorySpace > PrecMatrixCoo
Definition: precond.h:53
const unsigned int BLOCK_SIZE
Definition: common.h:49
__global__ void assembleReducedMat(int k, T *dWV, T *d_comp)
Definition: data_transfer.cuh:16
double getTimeBandLU() const
Definition: precond.h:82
__host__ __device__ T operator()(thrust::tuple< T, T > tu)
Definition: precond.h:249
PrecVector::value_type PrecValueType
Definition: precond.h:41
double getTimeToBanded() const
Definition: precond.h:80
SmallerThan(T threshold)
Definition: precond.h:257