Actual source code: bvorthogcuda.cu
 
   slepc-3.19.1 2023-05-15
   
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV orthogonalization routines (CUDA)
 12: */
 14: #include <slepc/private/bvimpl.h>
 15: #include <slepcblaslapack.h>
 16: #include <slepccublas.h>
 18: /*
 19:    BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
 20: */
 21: PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
 22: {
 23:   PetscScalar    *d_hh,*d_a;
 24:   PetscInt       i;
 26:   PetscFunctionBegin;
 27:   if (!h) {
 28:     PetscCall(VecCUDAGetArray(bv->buffer,&d_a));
 29:     PetscCall(PetscLogGpuTimeBegin());
 30:     d_hh = d_a + j*(bv->nc+bv->m);
 31:     PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar)));
 32:     PetscCall(PetscLogGpuTimeEnd());
 33:     PetscCall(VecCUDARestoreArray(bv->buffer,&d_a));
 34:   } else { /* cpu memory */
 35:     for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
 36:   }
 37:   PetscFunctionReturn(PETSC_SUCCESS);
 38: }
 40: /*
 41:    BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
 42:    into column j of the bv buffer
 43:  */
 44: PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
 45: {
 46:   PetscScalar    *d_h,*d_c,sone=1.0;
 47:   PetscInt       i;
 48:   PetscCuBLASInt idx=0,one=1;
 49:   cublasHandle_t cublasv2handle;
 51:   PetscFunctionBegin;
 52:   if (!h) {
 53:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
 54:     PetscCall(VecCUDAGetArray(bv->buffer,&d_c));
 55:     d_h = d_c + j*(bv->nc+bv->m);
 56:     PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
 57:     PetscCall(PetscLogGpuTimeBegin());
 58:     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one));
 59:     PetscCall(PetscLogGpuTimeEnd());
 60:     PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
 61:     PetscCall(VecCUDARestoreArray(bv->buffer,&d_c));
 62:   } else { /* cpu memory */
 63:     for (i=0;i<bv->nc+j;i++) h[i] += c[i];
 64:     PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
 65:   }
 66:   PetscFunctionReturn(PETSC_SUCCESS);
 67: }
 69: /*
 70:    BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
 71:    of the coefficients array
 72: */
 73: PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
 74: {
 75:   PetscScalar    *d_h,*a;
 77:   PetscFunctionBegin;
 78:   if (!h) {
 79:     PetscCall(VecCUDAGetArray(bv->buffer,&a));
 80:     PetscCall(PetscLogGpuTimeBegin());
 81:     d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
 82:     PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice));
 83:     PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar)));
 84:     PetscCall(PetscLogGpuTimeEnd());
 85:     PetscCall(VecCUDARestoreArray(bv->buffer,&a));
 86:   } else { /* cpu memory */
 87:     h[bv->nc+j] = value;
 88:   }
 89:   PetscFunctionReturn(PETSC_SUCCESS);
 90: }
 92: /*
 93:    BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
 94:    coefficients array (up to position j)
 95: */
 96: PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
 97: {
 98:   const PetscScalar *d_h;
 99:   PetscScalar       dot;
100:   PetscInt          i;
101:   PetscCuBLASInt    idx=0,one=1;
102:   cublasHandle_t    cublasv2handle;
104:   PetscFunctionBegin;
105:   if (!h) {
106:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
107:     PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
108:     PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
109:     PetscCall(PetscLogGpuTimeBegin());
110:     PetscCallCUBLAS(cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot));
111:     PetscCall(PetscLogGpuTimeEnd());
112:     PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j)));
113:     *sum = PetscRealPart(dot);
114:     PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
115:   } else { /* cpu memory */
116:     *sum = 0.0;
117:     for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
118:     PetscCall(PetscLogFlops(2.0*(bv->nc+j)));
119:   }
120:   PetscFunctionReturn(PETSC_SUCCESS);
121: }
123: /* pointwise multiplication */
124: __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
125: {
126:   PetscInt x;
128:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
129:   if (x<n) a[x] *= PetscRealPart(b[x]);
130: }
132: /* pointwise division */
133: __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
134: {
135:   PetscInt x;
137:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
138:   if (x<n) a[x] /= PetscRealPart(b[x]);
139: }
141: /*
142:    BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
143:    the contents of the coefficients array (up to position j) and omega is the signature;
144:    if inverse=TRUE then the operation is h/omega
145: */
146: PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
147: {
148:   PetscScalar       *d_h;
149:   const PetscScalar *d_omega,*omega;
150:   PetscInt          i,xcount;
151:   dim3              blocks3d, threads3d;
153:   PetscFunctionBegin;
154:   if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS);
155:   if (!h) {
156:     PetscCall(VecCUDAGetArray(bv->buffer,&d_h));
157:     PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega));
158:     PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount));
159:     PetscCall(PetscLogGpuTimeBegin());
160:     if (inverse) {
161:       for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
162:     } else {
163:       for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
164:     }
165:     PetscCallCUDA(cudaGetLastError());
166:     PetscCall(PetscLogGpuTimeEnd());
167:     PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
168:     PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega));
169:     PetscCall(VecCUDARestoreArray(bv->buffer,&d_h));
170:   } else {
171:     PetscCall(VecGetArrayRead(bv->omega,&omega));
172:     if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
173:     else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
174:     PetscCall(VecRestoreArrayRead(bv->omega,&omega));
175:     PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
176:   }
177:   PetscFunctionReturn(PETSC_SUCCESS);
178: }
180: /*
181:    BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
182:    of the coefficients array
183: */
184: PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
185: {
186:   const PetscScalar *d_h;
187:   PetscScalar       hh;
189:   PetscFunctionBegin;
190:   if (!h) {
191:     PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
192:     PetscCall(PetscLogGpuTimeBegin());
193:     PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost));
194:     PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar)));
195:     PetscCall(PetscLogGpuTimeEnd());
196:     PetscCall(BV_SafeSqrt(bv,hh,beta));
197:     PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
198:   } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta));
199:   PetscFunctionReturn(PETSC_SUCCESS);
200: }
202: /*
203:    BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
204:    provided by the caller (only values from l to j are copied)
205: */
206: PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
207: {
208:   const PetscScalar *d_h,*d_a;
209:   PetscInt          i;
211:   PetscFunctionBegin;
212:   if (!h) {
213:     PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a));
214:     PetscCall(PetscLogGpuTimeBegin());
215:     d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
216:     PetscCallCUDA(cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
217:     PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar)));
218:     PetscCall(PetscLogGpuTimeEnd());
219:     PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a));
220:   } else {
221:     for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
222:   }
223:   PetscFunctionReturn(PETSC_SUCCESS);
224: }