#include #include "mpl_blas.h" #define PRECISION double void mpl_dtri_run(diag,m,n,a0,a1,b0,b1) /*************************************************************************** * * * DATA PARALLEL BLAS based on MPL * * * * Internal routine, this routine is not supposed to be * * called by user programs. * * * * Version 1.0 1/4-92 , * * For MasPar MP-1 computers * * * * para//ab, University of Bergen, NORWAY * * * * The calling sequence may be changed in a future version. * * Please report any BUGs, ideas for improvement or other * * comments to * * adm@parallab.uib.no * * * * Future versions may then reflect your suggestions. * * The most current version of this software is available * * from netlib@nac.no , send the message `send index from maspar' * * * * REVISIONS: * * * ***************************************************************************/ /* The following routine performs upper triangular solve with multiple * right hand sides on m*m A-matrices and m*n B matrices with * (1<= m <= nxproc) and (1<= n <= nxproc). Here the A matrix is the * triangle, while the B matrix represents the collection of rhs's * arranged columnwise. * After execution the B matrix is overwritten by the answers. * This implementation is based on the standard column oriented back * substitution algorithm. The systems are solved concurrently. This is * done by spreading the appropriate column of the A matrix to all * columns on the PE grid, so that it can be used in processing each * B vector. * This routine applies to rectangular machines only, and the data storage * is as follows: * * If m <= nyproc, then each PE in the upper left m*m square of the dpu * contains one element each of the A matrix, and the upper left m*n * rectangle contains one element each of the B matrix, in the variables * a0 and b0 respectively. * If m > nyproc, the variables a1 and b1 are used to contain the rest of * the matrices. This is done such that the (nyproc+1)'th row of matrices * A and B is stored in the first row of PE's, the (nyproc+2)'th row in the * second row, and so on. Some PE's will then contain two matrix elements. * * Virtually it then looks like this (A matrix): * * * * | * 0 .....0...... 0 | * | * * 0 ...........0 | * a* | : : *.0.........0 | nyproc * | * ... * 0 .... 0 | * | ===================| * | * ......* * * ...0 | * a1 | : : : | nyproc * | * .............** | * * <----- nxproc ------> * * * The matrix appears transposed on the dpu due to MPF's storage format. * */ register int diag; register int m,n; plural PRECISION a0,a1; plural PRECISION *b0,*b1; { register int i,nx=nxproc,ny=nyproc, tmp_m; register plural int ix=ixproc,iy=iyproc; register plural PRECISION b0_wrk,b1_wrk,temp0,temp1,a_wrk; b0_wrk = *b0; b1_wrk = *b1; /* * This branch is executed if the A matrix is exactly * nxproc * nxproc large */ if (m == nx) { if ((diag != 'u') && (diag != 'U')) /* Scale A-matrix by its diagonal */ { if (ix == iy) xnetcW[nx].temp0 = a0; if (ix == iy+ny) xnetcW[nx].temp1 = a1; if (ix=0; i--) { if (iy == i) /* Spread (i+nyproc)'th A-column to all other columns */ xnetcN[ny].a_wrk = a1; /* Eliminating entries above diagonal of A matrix */ if (ix == i+ny) { xnetcW[nx].temp0 = b0_wrk; xnetcW[nx].temp1 = b1_wrk; } if (ix < i+ny) { b1_wrk -= a_wrk * temp1; b0_wrk -= a_wrk * temp0; } } /* * Loop performing column by column processing of the A matrix * (first half) */ for (i =ny-1;i>=0; i--) { if (iy == i) /* Spread i'th A-column to all other columns */ xnetcN[ny].a_wrk = a0; /* Eliminating entries above diagonal of A matrix */ if (ix == i) { xnetcW[i].temp0 = b0_wrk; xnetcW[i].temp1 = b1_wrk; } if (ix < i) { b0_wrk -= a_wrk * temp0; b1_wrk -= a_wrk * temp1; } } if ((diag != 'u') && (diag != 'U')) /* scale result by diagonal */ { if (ix == iy) xnetcS[ny].temp0 = a0; if (ix == iy+ny) xnetcS[ny].temp0 = a1; b0_wrk /= temp0; b1_wrk /= temp0; } } else /* if (m != nx) *****************************************/ /* * This code is executed for problems where the A matrix * is less than nxproc */ { if ((diag != 'u') && (diag != 'U')) /* Scale A matrix by its diagonal */ { if (ix == iy) xnetcW[nx].temp0 = a0; if (ix == iy+ny) xnetcW[nx].temp1 = a1; if ((ix ny) for (i=m-1-ny;i>=0;i--) { if (iy == i) /* Spread i'th A-column to all other columns */ { xnetcN[ny].a_wrk = a1; } /* Eliminating entries above diagonal of A matrix */ if (ix == i+ny) { xnetcW[nx].temp0 = b0_wrk; xnetcW[nx].temp1 = b1_wrk; } if (ix < i+ny) { b0_wrk -= a_wrk * temp0; b1_wrk -= a_wrk * temp1; } } if (m < ny) tmp_m = m - 1; else tmp_m = ny -1; for (i=tmp_m;i>=0;i--) { if (iy == i) /* Spread i'th A-column to all other columns */ { xnetcN[ny].a_wrk = a0; } /* Eliminating entries above diagonal of A matrix */ if (ix == i) { xnetcW[i].temp0 = b0_wrk; xnetcW[i].temp1 = b1_wrk; } if (ix < i) { b0_wrk -= a_wrk * temp0; b1_wrk -= a_wrk * temp1; } } if ((diag != 'u') && (diag != 'U')) /* scale result by diagonal */ { if (ix == iy) xnetcS[ny].temp0 = a0; if (ix == iy+ny) xnetcS[ny].temp0 = a1; if (ix < m) { b0_wrk /= temp0; b1_wrk /= temp0; } } } if (ix < m) { if (iy < n) *b0 =b0_wrk; if (iy+ny < n) *b1 =b1_wrk; } }