#include #include "mpl_blas.h" #define PRECISION double void mpl_dtrsv_up(diag,n,lda,a,init_x) /*************************************************************************** * * * DATA PARALLEL BLAS based on MPL * * * * Internal routine, this routine is not supposed to be * * called by user programs. * * * * Version 1.0 1/4-92 , * * For MasPar MP-1 computers * * * * para//ab, University of Bergen, NORWAY * * * * The calling sequence may be changed in a future version. * * Please report any BUGs, ideas for improvement or other * * comments to * * adm@parallab.uib.no * * * * Future versions may then reflect your suggestions. * * The most current version of this software is available * * from netlib@nac.no , send the message `send index from maspar' * * * * REVISIONS: * * * ***************************************************************************/ /* This routine handles the upper triangular part of STRSV The routine is written for arbitrary sized matrices. To solve diagonal blocs of size nxproc x nxproc it calls: mpl_dursolve.m (for rectangular MasPar) mpl_dussolve.m (for square MasPar) The overall algorithm is a row-oriented bloc triangular solver. The two parts of the routine are: i) elementwise forward substitution on diagonal blocks Done in the routine mpl_du*solve.m ii) Update of right hand side. To avoid transmission of vectors this part is written inline in stead of calling dgemv (matrix-vector multiply). */ int diag; int n,lda; plural PRECISION *init_x,*a; { plural PRECISION *x,x0_tmp[24],x1_tmp[24]; /* declaration of x0_tmp will be fixed in next release */ plural PRECISION x0_val,x1_val,a0_val,a1_val; register plural PRECISION tmp1,tmp2; register plural int ix=ixproc, iy=iyproc ; register int mask,j,ii,nbx,i,jmod; int n_mod_nx,nx=nxproc,ny=nyproc,size,imod,jind,iter; void mpl_dursolve(); void mpl_dussolve(); nbx = NBX(lda); /* # blocks in x-direction */ iter = NBX(n); /* # non-empty blocks */ n_mod_nx = n & (nx-1); x = init_x; if (ny != nx) { /* rectangular machine */ for(j=iter-1; j>=0;j--) /* loop over the nbx diagonal blocks */ { size = nx; jmod = j&(ny-1); jind = j/ny; if (n_mod_nx && (j == iter-1)) size=n_mod_nx; /* update subdiagonal blocks by matrix-vector multiply */ tmp2 = 0; /* First loop on each processors */ for(i = j+1; i< iter; i++) { tmp2 += a[nbx*2*i+j]* x0_tmp[i] ; tmp2 += a[nbx*(2*i+1)+j]* x1_tmp[i]; } /* Next across the processors */ ii = 0; for (mask=1; mask < ny ; mask <<= 1) /* log sum */ { if ((iy & (ii+=mask))==0) { tmp2 += xnetpS[mask].tmp2; } } if (iy == 0) xnetpS[jmod].tmp2 = tmp2; if (iy == jmod)x[jind] -= tmp2; /* update rhs */ /* place the current diagonal blocks into temporary register variables */ a0_val = a[nbx*2*j+j]; a1_val = a[nbx*(2*j+1)+j]; if (iy==0) x0_val = xnetpS[jmod].x[jind]; /* solve diagonal bloc */ mpl_dursolve(diag,size,a0_val,a1_val,&x0_val,&x1_val); if (ix==iy) xnetcN[ny].tmp1=x0_val; if (ix==iy+ny) xnetcN[ny].tmp1=x1_val; if (iy==jmod) x[jind] = tmp1; /* broadcast x-values for multiplication */ if (ix==iy) xnetcW[nx].x0_tmp[j]=x0_val; if (ix==iy+ny) xnetcW[nx].x1_tmp[j]=x1_val; } } else {/* square machine */ for(j=iter-1; j>=0 ;j--) /* loop over the nbx diagonal blocks */ { jmod = j&(ny-1); jind = j/ny; size = nx; if (n_mod_nx && (j == iter-1)) size=n_mod_nx; /* update subdiagonal blocks by matrix-vector multiply */ tmp2 = 0; /* First loop on each processors */ for(i = j+1; i< iter ;i++) { tmp2 += a[nbx*i+j]* x0_tmp[i] ; } /* Next across the processors */ ii = 0; for (mask=1; mask < ny ; mask <<= 1) /* log sum */ { if ((iy & (ii+=mask))==0) { tmp2 += xnetpS[mask].tmp2; } } if (iy == 0) xnetpS[jmod].tmp2 = tmp2; if (iy == jmod)x[jind] -= tmp2; /* update rhs */ /* place the current diagonal blocks into temporary register variables */ a0_val = a[nbx*j+j]; if (iy==0) x0_val = xnetpS[jmod].x[jind]; /* solve diagonal bloc */ mpl_dussolve(diag,size,a0_val,&x0_val); if (ix==iy) xnetcN[ny].tmp1=x0_val; if (iy==jmod) x[jind] = tmp1; /* broadcast x-values for multiplication */ if (ix==iy) xnetcW[nx].x0_tmp[j]=x0_val; } } }