Actual source code: bvblas.c
slepc-3.8.2 2017-12-01
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2017, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV private kernels that use the BLAS
12: */
14: #include <slepc/private/bvimpl.h>
15: #include <slepcblaslapack.h>
17: #define BLOCKSIZE 64
19: /*
20: C := alpha*A*B + beta*C
22: A is mxk (ld=m), B is kxn (ld=ldb), C is mxn (ld=m)
23: */
24: PetscErrorCode BVMult_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldb_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *B,PetscScalar beta,PetscScalar *C)
25: {
27: PetscBLASInt m,n,k,ldb;
28: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
29: PetscBLASInt l,bs=BLOCKSIZE;
30: #endif
33: PetscBLASIntCast(m_,&m);
34: PetscBLASIntCast(n_,&n);
35: PetscBLASIntCast(k_,&k);
36: PetscBLASIntCast(ldb_,&ldb);
37: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
38: l = m % bs;
39: if (l) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
40: for (;l<m;l+=bs) {
41: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bs,&n,&k,&alpha,(PetscScalar*)A+l,&m,(PetscScalar*)B,&ldb,&beta,C+l,&m));
42: }
43: #else
44: if (m) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
45: #endif
46: PetscLogFlops(2.0*m*n*k);
47: return(0);
48: }
50: /*
51: y := alpha*A*x + beta*y
53: A is nxk (ld=n)
54: */
55: PetscErrorCode BVMultVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *x,PetscScalar beta,PetscScalar *y)
56: {
58: PetscBLASInt n,k,one=1;
61: PetscBLASIntCast(n_,&n);
62: PetscBLASIntCast(k_,&k);
63: if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&n,&k,&alpha,A,&n,x,&one,&beta,y,&one));
64: PetscLogFlops(2.0*n*k);
65: return(0);
66: }
68: /*
69: A(:,s:e-1) := A*B(:,s:e-1)
71: A is mxk (ld=m), B is kxn (ld=ldb) n=e-s
72: */
73: PetscErrorCode BVMultInPlace_BLAS_Private(BV bv,PetscInt m_,PetscInt k_,PetscInt ldb_,PetscInt s,PetscInt e,PetscScalar *A,const PetscScalar *B,PetscBool btrans)
74: {
76: PetscScalar *pb,zero=0.0,one=1.0;
77: PetscBLASInt m,n,k,l,ldb,bs=BLOCKSIZE;
78: PetscInt j,n_=e-s;
79: const char *bt;
82: PetscBLASIntCast(m_,&m);
83: PetscBLASIntCast(n_,&n);
84: PetscBLASIntCast(k_,&k);
85: PetscBLASIntCast(ldb_,&ldb);
86: BVAllocateWork_Private(bv,BLOCKSIZE*n_);
87: if (btrans) {
88: pb = (PetscScalar*)B+s;
89: bt = "C";
90: } else {
91: pb = (PetscScalar*)B+s*ldb;
92: bt = "N";
93: }
94: l = m % bs;
95: if (l) {
96: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&k,&one,A,&m,pb,&ldb,&zero,bv->work,&l));
97: for (j=0;j<n;j++) {
98: PetscMemcpy(A+(s+j)*m,bv->work+j*l,l*sizeof(PetscScalar));
99: }
100: }
101: for (;l<m;l+=bs) {
102: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&k,&one,A+l,&m,pb,&ldb,&zero,bv->work,&bs));
103: for (j=0;j<n;j++) {
104: PetscMemcpy(A+(s+j)*m+l,bv->work+j*bs,bs*sizeof(PetscScalar));
105: }
106: }
107: PetscLogFlops(2.0*m*n*k);
108: return(0);
109: }
111: /*
112: V := V*B
114: V is mxn (ld=m), B is nxn (ld=k)
115: */
116: PetscErrorCode BVMultInPlace_Vecs_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,Vec *V,const PetscScalar *B,PetscBool btrans)
117: {
118: PetscErrorCode ierr;
119: PetscScalar zero=0.0,one=1.0,*out,*pout;
120: const PetscScalar *pin;
121: PetscBLASInt m,n,k,l,bs=BLOCKSIZE;
122: PetscInt j;
123: const char *bt;
126: PetscBLASIntCast(m_,&m);
127: PetscBLASIntCast(n_,&n);
128: PetscBLASIntCast(k_,&k);
129: BVAllocateWork_Private(bv,2*BLOCKSIZE*n_);
130: out = bv->work+BLOCKSIZE*n_;
131: if (btrans) bt = "C";
132: else bt = "N";
133: l = m % bs;
134: if (l) {
135: for (j=0;j<n;j++) {
136: VecGetArrayRead(V[j],&pin);
137: PetscMemcpy(bv->work+j*l,pin,l*sizeof(PetscScalar));
138: VecRestoreArrayRead(V[j],&pin);
139: }
140: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&n,&one,bv->work,&l,(PetscScalar*)B,&k,&zero,out,&l));
141: for (j=0;j<n;j++) {
142: VecGetArray(V[j],&pout);
143: PetscMemcpy(pout,out+j*l,l*sizeof(PetscScalar));
144: VecRestoreArray(V[j],&pout);
145: }
146: }
147: for (;l<m;l+=bs) {
148: for (j=0;j<n;j++) {
149: VecGetArrayRead(V[j],&pin);
150: PetscMemcpy(bv->work+j*bs,pin+l,bs*sizeof(PetscScalar));
151: VecRestoreArrayRead(V[j],&pin);
152: }
153: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&n,&one,bv->work,&bs,(PetscScalar*)B,&k,&zero,out,&bs));
154: for (j=0;j<n;j++) {
155: VecGetArray(V[j],&pout);
156: PetscMemcpy(pout+l,out+j*bs,bs*sizeof(PetscScalar));
157: VecRestoreArray(V[j],&pout);
158: }
159: }
160: PetscLogFlops(2.0*n*n*k);
161: return(0);
162: }
164: /*
165: B := alpha*A + beta*B
167: A,B are nxk (ld=n)
168: */
169: PetscErrorCode BVAXPY_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,PetscScalar beta,PetscScalar *B)
170: {
172: PetscBLASInt m,one=1;
175: PetscBLASIntCast(n_*k_,&m);
176: if (beta!=(PetscScalar)1.0) {
177: PetscStackCallBLAS("BLASscal",BLASscal_(&m,&beta,B,&one));
178: PetscLogFlops(m);
179: }
180: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&m,&alpha,A,&one,B,&one));
181: PetscLogFlops(2.0*m);
182: return(0);
183: }
185: /*
186: C := A'*B
188: A' is mxk (ld=k), B is kxn (ld=k), C is mxn (ld=ldc)
189: */
190: PetscErrorCode BVDot_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldc_,const PetscScalar *A,const PetscScalar *B,PetscScalar *C,PetscBool mpi)
191: {
193: PetscScalar zero=0.0,one=1.0,*CC;
194: PetscBLASInt m,n,k,ldc,j;
195: PetscMPIInt len;
198: PetscBLASIntCast(m_,&m);
199: PetscBLASIntCast(n_,&n);
200: PetscBLASIntCast(k_,&k);
201: PetscBLASIntCast(ldc_,&ldc);
202: if (mpi) {
203: if (ldc==m) {
204: BVAllocateWork_Private(bv,m*n);
205: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&ldc));
206: else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
207: PetscMPIIntCast(m*n,&len);
208: MPI_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
209: } else {
210: BVAllocateWork_Private(bv,2*m*n);
211: CC = bv->work+m*n;
212: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&m));
213: else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
214: PetscMPIIntCast(m*n,&len);
215: MPI_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
216: for (j=0;j<n;j++) {
217: PetscMemcpy(C+j*ldc,CC+j*m,m*sizeof(PetscScalar));
218: }
219: }
220: } else {
221: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,C,&ldc));
222: }
223: PetscLogFlops(2.0*m*n*k);
224: return(0);
225: }
227: /*
228: y := A'*x
230: A is nxk (ld=n)
231: */
232: PetscErrorCode BVDotVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *A,const PetscScalar *x,PetscScalar *y,PetscBool mpi)
233: {
235: PetscScalar zero=0.0,done=1.0;
236: PetscBLASInt n,k,one=1;
237: PetscMPIInt len;
240: PetscBLASIntCast(n_,&n);
241: PetscBLASIntCast(k_,&k);
242: if (mpi) {
243: BVAllocateWork_Private(bv,k);
244: if (n) {
245: PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,bv->work,&one));
246: } else {
247: PetscMemzero(bv->work,k*sizeof(PetscScalar));
248: }
249: PetscMPIIntCast(k,&len);
250: MPI_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
251: } else {
252: if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,y,&one));
253: }
254: PetscLogFlops(2.0*n*k);
255: return(0);
256: }
258: /*
259: Scale n scalars
260: */
261: PetscErrorCode BVScale_BLAS_Private(BV bv,PetscInt n_,PetscScalar *A,PetscScalar alpha)
262: {
264: PetscBLASInt n,one=1;
267: if (alpha == (PetscScalar)0.0) {
268: PetscMemzero(A,n_*sizeof(PetscScalar));
269: } else if (alpha!=(PetscScalar)1.0) {
270: PetscBLASIntCast(n_,&n);
271: PetscStackCallBLAS("BLASscal",BLASscal_(&n,&alpha,A,&one));
272: PetscLogFlops(n);
273: }
274: return(0);
275: }
277: /*
278: Compute ||A|| for an mxn matrix
279: */
280: PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
281: {
283: PetscBLASInt m,n,i,j;
284: PetscMPIInt len;
285: PetscReal lnrm,*rwork=NULL,*rwork2=NULL;
288: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
289: PetscBLASIntCast(m_,&m);
290: PetscBLASIntCast(n_,&n);
291: if (type==NORM_FROBENIUS || type==NORM_2) {
292: lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
293: if (mpi) {
294: lnrm = lnrm*lnrm;
295: MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
296: *nrm = PetscSqrtReal(*nrm);
297: } else *nrm = lnrm;
298: PetscLogFlops(2.0*m*n);
299: } else if (type==NORM_1) {
300: if (mpi) {
301: BVAllocateWork_Private(bv,2*n_);
302: rwork = (PetscReal*)bv->work;
303: rwork2 = rwork+n_;
304: PetscMemzero(rwork,n_*sizeof(PetscReal));
305: PetscMemzero(rwork2,n_*sizeof(PetscReal));
306: for (j=0;j<n_;j++) {
307: for (i=0;i<m_;i++) {
308: rwork[j] += PetscAbsScalar(A[i+j*m_]);
309: }
310: }
311: PetscMPIIntCast(n_,&len);
312: MPI_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
313: *nrm = 0.0;
314: for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
315: } else {
316: *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
317: }
318: PetscLogFlops(1.0*m*n);
319: } else if (type==NORM_INFINITY) {
320: BVAllocateWork_Private(bv,m_);
321: rwork = (PetscReal*)bv->work;
322: lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
323: if (mpi) {
324: MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
325: } else *nrm = lnrm;
326: PetscLogFlops(1.0*m*n);
327: }
328: PetscFPTrapPop();
329: return(0);
330: }
332: /*
333: QR factorization of an mxn matrix
334: */
335: PetscErrorCode BVOrthogonalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R)
336: {
337: #if defined(PETSC_MISSING_LAPACK_GEQRF) || defined(PETSC_MISSING_LAPACK_ORGQR)
339: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEQRF/ORGQR - Lapack routines are unavailable");
340: #else
342: PetscBLASInt m,n,i,j,k,l,nb,lwork,info;
343: PetscScalar *tau,*work,*Rl=NULL,*A=NULL,*C=NULL,one=1.0,zero=0.0;
344: PetscMPIInt rank,size,len;
345: PetscBool mpi;
348: PetscFPTrapPush(PETSC_FP_TRAP_OFF);
349: PetscBLASIntCast(m_,&m);
350: PetscBLASIntCast(n_,&n);
351: k = PetscMin(m,n);
352: nb = 16;
353: MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
354: if (size>2) SETERRQ(PetscObjectComm((PetscObject)bv),1,"Not implemented yet for more than two MPI processes");
355: mpi = size>1? PETSC_TRUE: PETSC_FALSE;
356: if (mpi) {
357: MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);
358: BVAllocateWork_Private(bv,k+n*nb+n*n+n*n*size+m*n);
359: } else {
360: BVAllocateWork_Private(bv,k+n*nb);
361: }
362: tau = bv->work;
363: work = bv->work+k;
364: PetscBLASIntCast(n*nb,&lwork);
365: if (mpi) {
366: Rl = bv->work+k+n*nb;
367: A = bv->work+k+n*nb+n*n;
368: C = bv->work+k+n*nb+n*n+n*n*size;
369: }
371: /* Compute QR */
372: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
373: SlepcCheckLapackInfo("geqrf",info);
375: /* Extract R */
376: if (R || mpi) {
377: PetscMemzero(mpi? Rl: R,n*n*sizeof(PetscScalar));
378: for (j=0;j<n;j++) {
379: for (i=0;i<=j;i++) {
380: if (mpi) Rl[i+j*n] = Q[i+j*m];
381: else R[i+j*n] = Q[i+j*m];
382: }
383: }
384: }
386: /* Compute orthogonal matrix in Q */
387: PetscStackCallBLAS("LAPACKungqr",LAPACKungqr_(&m,&n,&k,Q,&m,tau,work,&lwork,&info));
388: SlepcCheckLapackInfo("ungqr",info);
390: if (mpi) {
392: /* Stack triangular matrices */
393: PetscBLASIntCast(n*size,&l);
394: PetscMPIIntCast(n,&len);
395: for (j=0;j<n;j++) {
396: MPI_Allgather(Rl+j*n,len,MPIU_SCALAR,A+j*l,len,MPIU_SCALAR,PetscObjectComm((PetscObject)bv));
397: }
399: /* Compute QR */
400: PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
401: SlepcCheckLapackInfo("geqrf",info);
403: /* Extract R */
404: if (R) {
405: PetscMemzero(R,n*n*sizeof(PetscScalar));
406: for (j=0;j<n;j++)
407: for (i=0;i<=j;i++)
408: R[i+j*n] = A[i+j*l];
409: }
411: /* Accumulate orthogonal matrix */
412: PetscStackCallBLAS("LAPACKungqr",LAPACKungqr_(&l,&n,&n,A,&l,tau,work,&lwork,&info));
413: SlepcCheckLapackInfo("ungqr",info);
414: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&n,&one,Q,&m,A+rank*n,&l,&zero,C,&m));
415: PetscMemcpy(Q,C,m*n*sizeof(PetscScalar));
416: }
418: PetscLogFlops(3.0*m*n*n);
419: PetscFPTrapPop();
420: return(0);
421: #endif
422: }