Actual source code: sveccuda.cu

slepc-3.8.2 2017-12-01
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2017, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../svecimpl.h"

#define BLOCKSIZE 64

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
 */
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogFlops(m);CHKERRQ(ierr);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
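
/*
   Illustrative sketch (not part of the original source): a host-side
   reference for the update above.  Since A and B are stored contiguously
   (column-major with ld=n), the whole n x k update collapses into a single
   scal+axpy pair over m=n*k entries, which is what the cuBLAS calls do.
*/
static void ExampleAXPYReference(PetscInt n,PetscInt k,PetscScalar alpha,const PetscScalar *A,PetscScalar beta,PetscScalar *B)
{
  PetscInt i;
  for (i=0;i<n*k;i++) B[i] = alpha*A[i] + beta*B[i];  /* B := alpha*A + beta*B */
}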

/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          m,n,k,ldq,mq;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  m = Y->n;
  n = Y->k-Y->l;
  k = X->k-X->l;
  if (!Y->n) PetscFunctionReturn(0);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArrayReadWrite(y->v,&d_py);CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
    ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
    err = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    d_B = d_q+Y->l*ldq+X->l;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
    err = cudaFree(d_q);CHKERRCUDA(err);
    ierr = PetscLogFlops(2.0*m*n*k);CHKERRQ(ierr);
  } else {
    ierr = BVAXPY_BLAS_CUDA(Y,m,n,alpha,d_A,beta,d_C);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArrayReadWrite(y->v,&d_py);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
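
/*
   Illustrative sketch (not part of the original source): this kernel is
   normally reached through the public BVMult() interface, as in the
   hypothetical driver below; passing Q==NULL takes the BVAXPY_BLAS_CUDA
   branch instead of the gemm branch.
*/
static PetscErrorCode ExampleBVMultUsage(BV Y,BV X,Mat Q)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = BVMult(Y,1.0,0.0,X,Q);CHKERRQ(ierr);  /* Y := X*Q; beta==0 takes the write-only path */
  PetscFunctionReturn(0);
}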

/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArrayReadWrite(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDAGetArrayReadWrite(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    err = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArrayReadWrite(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDARestoreArrayReadWrite(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    err = cudaFree(d_q);CHKERRCUDA(err);
  }
  ierr = PetscLogFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
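
/*
   Illustrative sketch (not part of the original source): the gemv above is
   driven by the public BVMultVec() interface.  The coefficient array q is a
   host array with one entry per active column; inside the kernel, a NULL q
   means the coefficients are read from the internal X->buffer vector.
*/
static PetscErrorCode ExampleBVMultVecUsage(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = BVMultVec(X,1.0,0.0,y,q);CHKERRQ(ierr);  /* y := X*q */
  PetscFunctionReturn(0);
}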

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,l,ldq,nq,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  cudaError_t    err;
  size_t         freemem,totmem;

  PetscFunctionBegin;
  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) PetscFunctionReturn(0);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayReadWrite(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  /* try to allocate the whole matrix */
  err = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(err);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    }
  } else {
    /* not enough free memory: process the rows in blocks of size bs */
    bs = freemem/(m*sizeof(PetscScalar));
    err = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(err);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
  }

  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayReadWrite(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
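
/*
   Illustrative sketch (not part of the original source): the row-block
   partition used in the low-memory branch above.  Taking the remainder
   block l = m % bs first means every subsequent block has exactly bs rows,
   so the same gemm call shape can be reused for the entire loop.
*/
static PetscErrorCode ExampleRowBlocks(PetscInt m,PetscInt bs)
{
  PetscErrorCode ierr;
  PetscInt       l = m % bs;

  PetscFunctionBegin;
  if (l) { ierr = PetscPrintf(PETSC_COMM_SELF,"rows [0,%D)\n",l);CHKERRQ(ierr); }
  for (;l<m;l+=bs) { ierr = PetscPrintf(PETSC_COMM_SELF,"rows [%D,%D)\n",l,l+bs);CHKERRQ(ierr); }
  PetscFunctionReturn(0);
}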

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,ldq,nq;
  cublasStatus_t cberr;
  cudaError_t    err;

  PetscFunctionBegin;
  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) PetscFunctionReturn(0);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayReadWrite(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayReadWrite(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          ldm,m,n,k,j;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  m = Y->k-Y->l;
  n = X->k-X->l;
  k = X->n;
  ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      if (k) {
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm);CHKERRCUBLAS(cberr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      } else {
        ierr = PetscMemzero(X->work,m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    } else {
      ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
      CC = X->work+m*n;
      if (k) {
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      } else {
        ierr = PetscMemzero(X->work,m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscMemcpy(C+j*ldm,CC+j*m,m*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    }
  } else {
    if (k) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      for (j=0;j<n;j++) {
        ierr = PetscMemcpy(C+j*ldm,X->work+j*m,m*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
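
/*
   Illustrative sketch (not part of the original source): the Gram-matrix
   computation above is reached through the public BVDot() interface.  M must
   be a sequential dense matrix large enough for the active columns; the
   dimensions ky x kx below are hypothetical.
*/
static PetscErrorCode ExampleBVDotUsage(BV X,BV Y,PetscInt kx,PetscInt ky)
{
  PetscErrorCode ierr;
  Mat            M;

  PetscFunctionBegin;
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,ky,kx,NULL,&M);CHKERRQ(ierr);
  ierr = BVDot(X,Y,M);CHKERRQ(ierr);  /* M(i,j) = y_i' * x_j */
  ierr = MatDestroy(&M);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}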

/*
    y := A'*x
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  if (!q) {
    ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
  } else {
    err = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
    if (n) {
      cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,n,d_x,one,&szero,d_work,one);CHKERRCUBLAS(cberr);
      err = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    } else {
      ierr = PetscMemzero(X->work,k*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
      ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
    } else {
      err = cudaFree(d_work);CHKERRCUDA(err);
    }
    ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
    ierr = MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
  } else {
    if (n) {
      cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,n,d_x,one,&szero,d_work,one);CHKERRCUBLAS(cberr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
    } else {
      err = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaFree(d_work);CHKERRCUDA(err);
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
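
/*
   Illustrative sketch (not part of the original source): the gemv plus
   Allreduce above is what the public BVDotVec() performs.  The output array
   must hold one scalar per active column; k below is hypothetical.
*/
static PetscErrorCode ExampleBVDotVecUsage(BV X,Vec y,PetscInt k)
{
  PetscErrorCode ierr;
  PetscScalar    *m;

  PetscFunctionBegin;
  ierr = PetscMalloc1(k,&m);CHKERRQ(ierr);
  ierr = BVDotVec(X,y,m);CHKERRQ(ierr);  /* m[j] = x_j' * y */
  ierr = PetscFree(m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}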

/*
    y := A'*x  (local part only, no reduction across processes)
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    err = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(err);
    cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_C,n,k,&sone,d_A,n,d_x,one,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    err = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err = cudaFree(d_y);CHKERRCUDA(err);  /* free only when it was actually allocated */
  }
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cudaError_t    err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayReadWrite(ctx->v,&d_array);CHKERRQ(ierr);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    n = (bv->k-bv->l)*bv->n;
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    n = bv->n;
  }
  if (alpha == (PetscScalar)0.0) {
    err = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  } else if (alpha != (PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogFlops(n);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayReadWrite(ctx->v,&d_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
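
/*
   Illustrative sketch (not part of the original source): the two branches
   above (j<0 for all active columns, j>=0 for one column) are selected by
   the public BVScale() and BVScaleColumn() calls respectively.
*/
static PetscErrorCode ExampleBVScaleUsage(BV bv)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = BVScale(bv,2.0);CHKERRQ(ierr);          /* j<0 path: scale all active columns */
  ierr = BVScaleColumn(bv,0,0.0);CHKERRQ(ierr);  /* alpha==0 path: cudaMemset of column 0 */
  PetscFunctionReturn(0);
}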

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  for (j=0;j<V->k-V->l;j++) {
    ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
    ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
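
/*
   Illustrative sketch (not part of the original source): BVMatMult applies
   the operator column by column by temporarily wrapping each column in the
   workspace vectors V->cv[1]/W->cv[1], as above.  A typical call site:
*/
static PetscErrorCode ExampleBVMatMultUsage(BV V,Mat A)
{
  PetscErrorCode ierr;
  BV             W;

  PetscFunctionBegin;
  ierr = BVDuplicate(V,&W);CHKERRQ(ierr);  /* W gets the same layout as V */
  ierr = BVMatMult(V,A,W);CHKERRQ(ierr);   /* W(:,j) = A*V(:,j) for the active columns */
  ierr = BVDestroy(&W);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}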

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  err = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
  ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
  ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
  ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
  ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
  if (((PetscObject)bv)->name) {
    ierr = PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
    ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
  }
  if (copy) {
    ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
    err = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
  }
  ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
  ctx->v = vnew;
  PetscFunctionReturn(0);
}
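
/*
   Illustrative sketch (not part of the original source): growing the number
   of columns while keeping the existing data, which exercises the copy
   branch above; the factor of 2 is hypothetical.
*/
static PetscErrorCode ExampleBVResizeUsage(BV bv,PetscInt m)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = BVResize(bv,2*m,PETSC_TRUE);CHKERRQ(ierr);  /* copy==PETSC_TRUE preserves old columns */
  PetscFunctionReturn(0);
}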

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  ierr = VecCUDAGetArrayReadWrite(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayReadWrite(ctx->v,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
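
/*
   Illustrative sketch (not part of the original source): the standard
   get/modify/restore pattern served by the two functions above.  The Vec
   returned by BVGetColumn() aliases the column storage (no copy), so it
   must be restored before the column is used through the BV again.
*/
static PetscErrorCode ExampleBVColumnUsage(BV bv,PetscInt j)
{
  PetscErrorCode ierr;
  Vec            v;

  PetscFunctionBegin;
  ierr = BVGetColumn(bv,j,&v);CHKERRQ(ierr);
  ierr = VecSet(v,1.0);CHKERRQ(ierr);        /* modify column j in place */
  ierr = BVRestoreColumn(bv,j,&v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}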