Actual source code: fnexp.c
slepc-3.10.2 2019-02-11
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2018, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h> /*I "slepcfn.h" I*/
15: #include <slepcblaslapack.h>
17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
20: *y = PetscExpScalar(x);
21: return(0);
22: }
24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25: {
27: *y = PetscExpScalar(x);
28: return(0);
29: }
31: #define MAX_PADE 6
32: #define SWAP(a,b,t) {t=a;a=b;b=t;}
34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
35: {
36: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
38: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
39: #else
41: PetscBLASInt n,ld,ld2,*ipiv,info,inc=1;
42: PetscInt m,j,k,sexp;
43: PetscBool odd;
44: const PetscInt p=MAX_PADE;
45: PetscReal c[MAX_PADE+1],s,*rwork;
46: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
47: PetscScalar *Aa,*Ba,*As,*A2,*Q,*P,*W,*aux;
50: MatDenseGetArray(A,&Aa);
51: MatDenseGetArray(B,&Ba);
52: MatGetSize(A,&m,NULL);
53: PetscBLASIntCast(m,&n);
54: ld = n;
55: ld2 = ld*ld;
56: P = Ba;
57: PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
58: PetscMemcpy(As,Aa,ld2*sizeof(PetscScalar));
60: /* Pade' coefficients */
61: c[0] = 1.0;
62: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
64: /* Scaling */
65: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
66: PetscLogFlops(1.0*n*n);
67: if (s>0.5) {
68: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
69: scale = PetscPowRealInt(2.0,-sexp);
70: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
71: PetscLogFlops(1.0*n*n);
72: } else sexp = 0;
74: /* Horner evaluation */
75: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
76: PetscLogFlops(2.0*n*n*n);
77: PetscMemzero(Q,ld2*sizeof(PetscScalar));
78: PetscMemzero(P,ld2*sizeof(PetscScalar));
79: for (j=0;j<n;j++) {
80: Q[j+j*ld] = c[p];
81: P[j+j*ld] = c[p-1];
82: }
84: odd = PETSC_TRUE;
85: for (k=p-1;k>0;k--) {
86: if (odd) {
87: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
88: SWAP(Q,W,aux);
89: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
90: odd = PETSC_FALSE;
91: } else {
92: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
93: SWAP(P,W,aux);
94: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
95: odd = PETSC_TRUE;
96: }
97: PetscLogFlops(2.0*n*n*n);
98: }
99: if (odd) {
100: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
101: SWAP(Q,W,aux);
102: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
103: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
104: SlepcCheckLapackInfo("gesv",info);
105: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
106: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
107: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
108: } else {
109: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
110: SWAP(P,W,aux);
111: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
112: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
113: SlepcCheckLapackInfo("gesv",info);
114: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
115: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
116: }
117: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
119: for (k=1;k<=sexp;k++) {
120: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
121: PetscMemcpy(P,W,ld2*sizeof(PetscScalar));
122: }
123: if (P!=Ba) { PetscMemcpy(Ba,P,ld2*sizeof(PetscScalar)); }
124: PetscLogFlops(2.0*n*n*n*sexp);
126: PetscFree6(Q,W,As,A2,rwork,ipiv);
127: MatDenseRestoreArray(A,&Aa);
128: MatDenseRestoreArray(B,&Ba);
129: return(0);
130: #endif
131: }
133: #define PARTIAL_FRACTION_FORM 0
134: #define PRODUCT_FORM 1
136: /*
137: * Set scaling factor (s) and Pade degree (k,m)
138: */
139: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt mode,PetscInt *s,PetscInt *k,PetscInt *m)
140: {
142: if (nrm>1) {
143: if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
144: else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
145: else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
146: else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
147: else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
148: else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
149: else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
150: else {*s = 1; *k = 1; *m = *k+1;}
151: } else { /* nrm<1 */
152: if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
153: else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
154: else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
155: else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
156: else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
157: else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
158: else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
159: else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
160: else {*s = 0; *k = 1; *m = 0;}
161: }
162: return(0);
163: }
165: #if defined(PETSC_HAVE_COMPLEX)
166: /*
167: * Partial fraction form coefficients.
168: * If query, the function returns the size necessary to store the coefficients.
169: */
170: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
171: {
172: PetscInt i;
173: const PetscComplex /* m == k+1 */
174: p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
175: -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
176: 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
177: 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
178: -2.733432894659307e+02 },
179: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
180: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
181: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
182: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
183: 6.286704751729261e+00 },
184: p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
185: -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
186: 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
187: 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
188: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
189: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
190: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
191: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
192: p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
193: 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
194: -1.829749817484586e+01 },
195: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
196: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
197: 3.637834252744491e+00 },
198: p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
199: 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
200: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
201: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
202: const PetscComplex /* m == k-1 */
203: m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
204: -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
205: 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
206: 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
207: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
208: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
209: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
210: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
211: m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
212: 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
213: -2.734353918633177e+02 },
214: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
215: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
216: 5.648485971016893e+00 },
217: m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
218: 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
219: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
220: 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
221: const PetscScalar /* m == k-1 */
222: m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
223: m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
224: m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
225: m1remain2[2] = {-0.5, -3.5},
226: remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
227: remain2[3] = {1.0/2.0, 1, 1};
230: if (query) { /* query about buffer's size */
231: if (m==k+1) {
232: *remain = 0;
233: if (k==4) {
234: *r = *q = 5;
235: } else if (k==3) {
236: *r = *q = 4;
237: } else if (k==2) {
238: *r = *q = 3;
239: } else if (k==1) {
240: *r = *q = 2;
241: }
242: return(0); /* quick return */
243: }
244: if (m==k-1) {
245: if (k==5) {
246: *r = *q = 4; *remain = 2;
247: } else if (k==4) {
248: *r = *q = 3; *remain = 2;
249: } else if (k==3) {
250: *r = *q = 2; *remain = 2;
251: } else if (k==2) {
252: *r = *q = 1; *remain = 2;
253: }
254: }
255: if (m==0) {
256: *r = *q = 0;
257: if (k==3) {
258: *remain = 4;
259: } else if (k==2) {
260: *remain = 3;
261: }
262: }
263: } else {
264: if (m==k+1) {
265: if (k==4) {
266: for (i=0;i<5;i++) {
267: r[i] = p1r4[i]; q[i] = p1q4[i];
268: }
269: } else if (k==3) {
270: for (i=0;i<4;i++) {
271: r[i] = p1r3[i]; q[i] = p1q3[i];
272: }
273: } else if (k==2) {
274: for (i=0;i<3;i++) {
275: r[i] = p1r2[i]; q[i] = p1q2[i];
276: }
277: } else if (k==1) {
278: for (i=0;i<2;i++) {
279: r[i] = p1r1[i]; q[i] = p1q1[i];
280: }
281: }
282: return(0); /* quick return */
283: }
284: if (m==k-1) {
285: if (k==5) {
286: for (i=0;i<4;i++) {
287: r[i] = m1r5[i]; q[i] = m1q5[i];
288: }
289: for (i=0;i<2;i++) {
290: remain[i] = m1remain5[i];
291: }
292: } else if (k==4) {
293: for (i=0;i<3;i++) {
294: r[i] = m1r4[i]; q[i] = m1q4[i];
295: }
296: for (i=0;i<2;i++) {
297: remain[i] = m1remain4[i];
298: }
299: } else if (k==3) {
300: for (i=0;i<2;i++) {
301: r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i];
302: }
303: } else if (k==2) {
304: r[0] = -13.5;
305: q[0] = 3;
306: for (i=0;i<2;i++) {
307: remain[i] = m1remain2[i];
308: }
309: }
310: }
311: if (m==0) {
312: r = q = 0;
313: if (k==3) {
314: for (i=0;i<4;i++) {
315: remain[i] = remain3[i];
316: }
317: } else if (k==2) {
318: for (i=0;i<3;i++) {
319: remain[i] = remain2[i];
320: }
321: }
322: }
323: }
324: return(0);
325: }
327: /*
328: * Product form coefficients.
329: * If query, the function returns the size necessary to store the coefficients.
330: */
331: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
332: {
333: PetscInt i;
334: const PetscComplex /* m == k+1 */
335: p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
336: -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
337: -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
338: -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
339: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
340: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
341: 6.286704751729261e+00 ,
342: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
343: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
344: p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
345: -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
346: -5.648485971016893e+00 },
347: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
348: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
349: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
350: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
351: p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
352: -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
353: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
354: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
355: 3.637834252744491e+00 },
356: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
357: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
358: const PetscComplex /* m == k-1 */
359: m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
360: -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
361: -6.286704751729261e+00 ,
362: -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
363: -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
364: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
365: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
366: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
367: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
368: m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
369: -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
370: -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
371: -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
372: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
373: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
374: 5.648485971016893e+00 },
375: m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
376: -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
377: -3.637834252744491e+00 },
378: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
379: 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
380: m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
381: -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
384: if (query) {
385: if (m == k+1) {
386: *mult = 1;
387: if (k==4) {
388: *p = 4; *q = 5;
389: } else if (k==3) {
390: *p = 3; *q = 4;
391: } else if (k==2) {
392: *p = 2; *q = 3;
393: } else if (k==1) {
394: *p = 1; *q = 2;
395: }
396: return(0);
397: }
398: if (m==k-1) {
399: *mult = 1;
400: if (k==5) {
401: *p = 5; *q = 4;
402: } else if (k==4) {
403: *p = 4; *q = 3;
404: } else if (k==3) {
405: *p = 3; *q = 2;
406: } else if (k==2) {
407: *p = 2; *q = 1;
408: }
409: }
410: } else {
411: if (m == k+1) {
412: *mult = PetscPowInt(-1,m);
413: *mult *= m;
414: if (k==4) {
415: for (i=0;i<4;i++) {
416: p[i] = p1p4[i]; q[i] = p1q4[i];
417: }
418: q[4] = p1q4[4];
419: } else if (k==3) {
420: for (i=0;i<3;i++) {
421: p[i] = p1p3[i]; q[i] = p1q3[i];
422: }
423: q[3] = p1q3[3];
424: } else if (k==2) {
425: for (i=0;i<2;i++) {
426: p[i] = p1p2[i]; q[i] = p1q2[i];
427: }
428: q[2] = p1q2[2];
429: } else if (k==1) {
430: p[0] = -3;
431: for (i=0;i<2;i++) {
432: q[i] = p1q1[i];
433: }
434: }
435: return(0);
436: }
437: if (m==k-1) {
438: *mult = PetscPowInt(-1,m);
439: *mult /= k;
440: if (k==5) {
441: for (i=0;i<4;i++) {
442: p[i] = m1p5[i]; q[i] = m1q5[i];
443: }
444: p[4] = m1p5[4];
445: } else if (k==4) {
446: for (i=0;i<3;i++) {
447: p[i] = m1p4[i]; q[i] = m1q4[i];
448: }
449: p[3] = m1p4[3];
450: } else if (k==3) {
451: for (i=0;i<2;i++) {
452: p[i] = m1p3[i]; q[i] = m1q3[i];
453: }
454: p[2] = m1p3[2];
455: } else if (k==2) {
456: for (i=0;i<2;i++) {
457: p[i] = m1p2[i];
458: }
459: q[0] = 3;
460: }
461: }
462: }
463: return(0);
464: }
465: #endif /* PETSC_HAVE_COMPLEX */
467: #if defined(PETSC_USE_COMPLEX)
468: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
469: {
470: PetscInt i;
473: *result=PETSC_TRUE;
474: for (i=0;i<n&&*result;i++) {
475: if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
476: }
477: return(0);
478: }
479: #endif
481: /*
482: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
483: * and Yuji Nakatsukasa
484: *
485: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
486: * Approximation for the Matrix Exponential",
487: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
488: * https://doi.org/10.1137/15M1027553
489: */
490: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
491: {
492: #if !defined(PETSC_HAVE_COMPLEX)
494: SETERRQ(PETSC_COMM_SELF,1,"This function requires C99 or C++ complex support");
495: #elif defined(PETSC_MISSING_LAPACK_GEEV) || defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
497: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEEV/GESV/LANGE - Lapack routines are unavailable");
498: #else
499: PetscInt i,j,n_,s,k,m,mode=PRODUCT_FORM,mod;
500: PetscBLASInt n,n2,irsize,rsizediv2,ipsize,iremainsize,query=-1,info,*piv,minlen,lwork,one=1;
501: PetscReal nrm,shift;
502: #if defined(PETSC_USE_COMPLEX)
503: PetscReal *rwork=NULL;
504: #endif
505: PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
506: PetscScalar *Aa,*Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
508: PetscBool isreal;
511: MatGetSize(A,&n_,NULL);
512: PetscBLASIntCast(n_,&n);
513: MatDenseGetArray(A,&Aa);
514: MatDenseGetArray(B,&Ba);
515: Ba2 = Ba;
516: PetscBLASIntCast(n*n,&n2);
518: PetscMalloc2(n2,&sMaux,n2,&Maux);
519: Maux2 = Maux;
520: PetscMalloc2(n,&wr,n,&wi);
521: PetscMemcpy(sMaux,Aa,n2*sizeof(PetscScalar));
522: /* estimate rightmost eigenvalue and shift A with it */
523: #if !defined(PETSC_USE_COMPLEX)
524: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
525: SlepcCheckLapackInfo("geev",info);
526: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
527: PetscMalloc1(lwork,&work);
528: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
529: PetscFree(work);
530: #else
531: PetscMemcpy(Maux,Aa,n2*sizeof(PetscScalar));
532: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
533: SlepcCheckLapackInfo("geev",info);
534: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
535: PetscMalloc2(2*n,&rwork,lwork,&work);
536: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
537: PetscFree2(rwork,work);
538: #endif
539: SlepcCheckLapackInfo("geev",info);
540: PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);
542: shift = PetscRealPart(wr[0]);
543: for (i=1;i<n;i++) {
544: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
545: }
546: PetscFree2(wr,wi);
547: /* shift so that largest real part is (about) 0 */
548: PetscMemcpy(sMaux,Aa,n2*sizeof(PetscScalar));
549: for (i=0;i<n;i++) {
550: sMaux[i+i*n] -= shift;
551: }
552: PetscLogFlops(1.0*n);
553: #if defined(PETSC_USE_COMPLEX)
554: PetscMemcpy(Maux,Aa,n2*sizeof(PetscScalar));
555: for (i=0;i<n;i++) {
556: Maux[i+i*n] -= shift;
557: }
558: PetscLogFlops(1.0*n);
559: #endif
561: /* estimate norm(A) and select the scaling factor */
562: nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
563: PetscLogFlops(1.0*n*n);
564: sexpm_params(nrm,mode,&s,&k,&m);
565: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
566: expshift = PetscExpScalar(shift);
567: for (i=0;i<n;i++) {
568: sMaux[i+i*n] += 1.0;
569: }
570: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
571: PetscLogFlops(1.0*(n+n2));
572: PetscMemcpy(Ba,sMaux,n2*sizeof(PetscScalar));
573: PetscFree(sMaux);
574: MatDenseRestoreArray(A,&Aa);
575: MatDenseRestoreArray(B,&Ba);
576: return(0); /* quick return */
577: }
579: PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
580: expmA2 = expmA; RR2 = RR;
581: /* scale matrix */
582: #if !defined(PETSC_USE_COMPLEX)
583: for (i=0;i<n2;i++) {
584: As[i] = sMaux[i];
585: }
586: #else
587: PetscMemcpy(As,sMaux,n2*sizeof(PetscScalar));
588: #endif
589: scale = 1.0/PetscPowRealInt(2.0,s);
590: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
591: SlepcLogFlopsComplex(1.0*n2);
593: /* evaluate Pade approximant (partial fraction or product form) */
594: if (mode==PARTIAL_FRACTION_FORM || !m) { /* partial fraction */
595: getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
596: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
597: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
598: PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
599: PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
600: getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);
602: PetscMemzero(expmA,n2*sizeof(PetscComplex));
603: #if !defined(PETSC_USE_COMPLEX)
604: isreal = PETSC_TRUE;
605: #else
606: getisreal(n2,Maux,&isreal);
607: #endif
608: if (isreal) {
609: rsizediv2 = irsize/2;
610: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
611: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
612: PetscMemzero(RR,n2*sizeof(PetscComplex));
613: for (j=0;j<n;j++) {
614: Maux[j+j*n] -= p[2*i];
615: RR[j+j*n] = r[2*i];
616: }
617: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
618: SlepcCheckLapackInfo("gesv",info);
619: for (j=0;j<n2;j++) {
620: expmA[j] += RR[j] + PetscConj(RR[j]);
621: }
622: /* loop(n) + gesv + loop(n2) */
623: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
624: }
626: mod = ipsize % 2;
627: if (mod) {
628: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
629: PetscMemzero(RR,n2*sizeof(PetscComplex));
630: for (j=0;j<n;j++) {
631: Maux[j+j*n] -= p[ipsize-1];
632: RR[j+j*n] = r[irsize-1];
633: }
634: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
635: SlepcCheckLapackInfo("gesv",info);
636: for (j=0;j<n2;j++) {
637: expmA[j] += RR[j];
638: }
639: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
640: }
641: } else { /* complex */
642: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
643: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
644: PetscMemzero(RR,n2*sizeof(PetscComplex));
645: for (j=0;j<n;j++) {
646: Maux[j+j*n] -= p[i];
647: RR[j+j*n] = r[i];
648: }
649: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
650: SlepcCheckLapackInfo("gesv",info);
651: for (j=0;j<n2;j++) {
652: expmA[j] += RR[j];
653: }
654: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
655: }
656: }
657: for (i=0;i<iremainsize;i++) {
658: if (!i) {
659: PetscMemzero(RR,n2*sizeof(PetscComplex));
660: for (j=0;j<n;j++) {
661: RR[j+j*n] = remainterm[iremainsize-1];
662: }
663: } else {
664: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
665: for (j=1;j<i;j++) {
666: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
667: SWAP(RR,Maux,aux);
668: SlepcLogFlopsComplex(2.0*n*n*n);
669: }
670: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
671: SlepcLogFlopsComplex(1.0*n2);
672: }
673: for (j=0;j<n2;j++) {
674: expmA[j] += RR[j];
675: }
676: SlepcLogFlopsComplex(1.0*n2);
677: }
678: PetscFree3(r,p,remainterm);
679: } else { /* product form, default */
680: getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
681: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
682: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
683: PetscMalloc2(irsize,&rootp,ipsize,&rootq);
684: getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);
686: PetscMemzero(expmA,n2*sizeof(PetscComplex));
687: for (i=0;i<n;i++) { /* initialize */
688: expmA[i+i*n] = 1.0;
689: }
690: minlen = PetscMin(irsize,ipsize);
691: for (i=0;i<minlen;i++) {
692: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
693: for (j=0;j<n;j++) {
694: RR[j+j*n] -= rootp[i];
695: }
696: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
697: SWAP(expmA,Maux,aux);
698: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
699: for (j=0;j<n;j++) {
700: RR[j+j*n] -= rootq[i];
701: }
702: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
703: SlepcCheckLapackInfo("gesv",info);
704: /* loop(n) + gemm + loop(n) + gesv */
705: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
706: }
707: /* extra enumerator */
708: for (i=minlen;i<irsize;i++) {
709: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
710: for (j=0;j<n;j++) {
711: RR[j+j*n] -= rootp[i];
712: }
713: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
714: SWAP(expmA,Maux,aux);
715: SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
716: }
717: /* extra denominator */
718: for (i=minlen;i<ipsize;i++) {
719: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
720: for (j=0;j<n;j++) {
721: RR[j+j*n] -= rootq[i];
722: }
724: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
725: SlepcCheckLapackInfo("gesv",info);
726: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
727: }
728: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
729: SlepcLogFlopsComplex(1.0*n2);
730: PetscFree2(rootp,rootq);
731: }
733: #if !defined(PETSC_USE_COMPLEX)
734: for (i=0;i<n2;i++) {
735: Ba2[i] = PetscRealPartComplex(expmA[i]);
736: }
737: #else
738: PetscMemcpy(Ba2,expmA,n2*sizeof(PetscScalar));
739: #endif
741: /* perform repeated squaring */
742: for (i=0;i<s;i++) { /* final squaring */
743: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
744: SWAP(Ba2,sMaux,saux);
745: PetscLogFlops(2.0*n*n*n);
746: }
747: if (Ba2!=Ba) {
748: PetscMemcpy(Ba,Ba2,n2*sizeof(PetscScalar));
749: sMaux = Ba2;
750: }
751: expshift = PetscExpReal(shift);
752: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
753: PetscLogFlops(1.0*n2);
755: /* restore pointers */
756: Maux = Maux2; expmA = expmA2; RR = RR2;
757: PetscFree2(sMaux,Maux);
758: PetscFree4(expmA,As,RR,piv);
759: MatDenseRestoreArray(A,&Aa);
760: MatDenseRestoreArray(B,&Ba);
761: return(0);
762: #endif
763: }
765: #define ITMAX 5
767: /*
768: * Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
769: */
770: static PetscReal normest1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
771: {
772: PetscScalar *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
773: PetscReal est=0.0,est_old,vals[2]={0.0,0.0},*zvals,maxzval[2],raux;
774: PetscBLASInt i,j,t=2,it=0,ind[2],est_j=0,m1;
778: X = work;
779: Y = work + 2*n;
780: Z = work + 4*n;
781: S = work + 6*n;
782: S_old = work + 8*n;
783: zvals = (PetscReal*)(work + 10*n);
785: for (i=0;i<n;i++) { /* X has columns of unit 1-norm */
786: X[i] = 1.0/n;
787: PetscRandomGetValue(rand,&val);
788: if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
789: else X[i+n] = 1.0/n;
790: }
791: for (i=0;i<t*n;i++) S[i] = 0.0;
792: ind[0] = 0; ind[1] = 0;
793: est_old = 0;
794: while (1) {
795: it++;
796: for (j=0;j<m;j++) { /* Y = A^m*X */
797: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
798: if (j<m-1) SWAP(X,Y,aux);
799: }
800: for (j=0;j<t;j++) { /* vals[j] = norm(Y(:,j),1) */
801: vals[j] = 0.0;
802: for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
803: }
804: if (vals[0]<vals[1]) {
805: SWAP(vals[0],vals[1],raux);
806: m1 = 1;
807: } else m1 = 0;
808: est = vals[0];
809: if (est>est_old || it==2) est_j = ind[m1];
810: if (it>=2 && est<=est_old) {
811: est = est_old;
812: break;
813: }
814: est_old = est;
815: if (it>ITMAX) break;
816: SWAP(S,S_old,aux);
817: for (i=0;i<t*n;i++) { /* S = sign(Y) */
818: S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
819: }
820: for (j=0;j<m;j++) { /* Z = (A^T)^m*S */
821: PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
822: if (j<m-1) SWAP(S,Z,aux);
823: }
824: maxzval[0] = -1; maxzval[1] = -1;
825: ind[0] = 0; ind[1] = 0;
826: for (i=0;i<n;i++) { /* zvals[i] = norm(Z(i,:),inf) */
827: zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
828: if (zvals[i]>maxzval[0]) {
829: maxzval[0] = zvals[i];
830: ind[0] = i;
831: } else if (zvals[i]>maxzval[1]) {
832: maxzval[1] = zvals[i];
833: ind[1] = i;
834: }
835: }
836: if (it>=2 && maxzval[0]==zvals[est_j]) break;
837: for (i=0;i<t*n;i++) X[i] = 0.0;
838: for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
839: }
840: /* Flop count is roughly (it * 2*m * t*gemv) = 4*its*m*t*n*n */
841: PetscLogFlops(4.0*it*m*t*n*n);
842: PetscFunctionReturn(est);
843: }
845: #define SMALLN 100
847: /*
848: * Estimate norm(A^m,1) (required workspace is 2*n*n)
849: */
850: static PetscReal normAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
851: {
852: PetscScalar *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
853: PetscReal nrm,rwork[1],tmp;
854: PetscBLASInt i,j,one=1;
855: PetscBool isrealpos=PETSC_TRUE;
859: if (n<SMALLN) { /* compute matrix power explicitly */
860: if (m==1) {
861: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
862: PetscLogFlops(1.0*n*n);
863: } else { /* m>=2 */
864: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
865: for (j=0;j<m-2;j++) {
866: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
867: SWAP(v,w,aux);
868: }
869: nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
870: PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);
871: }
872: } else {
873: for (i=0;i<n;i++)
874: for (j=0;j<n;j++)
875: #if defined(PETSC_USE_COMPLEX)
876: if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
877: #else
878: if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
879: #endif
880: if (isrealpos) { /* for positive matrices only */
881: for (i=0;i<n;i++) v[i] = 1.0;
882: for (j=0;j<m;j++) { /* w = A'*v */
883: PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
884: SWAP(v,w,aux);
885: }
886: PetscLogFlops(2.0*n*n*m);
887: nrm = 0.0;
888: for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > nrm) nrm = tmp; /* norm(v,inf) */
889: } else {
890: nrm = normest1(n,A,m,work,rand);
891: }
892: }
893: PetscFunctionReturn(nrm);
894: }
896: /*
897: * Function needed to compute optimal parameters (required workspace is 3*n*n)
898: */
899: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
900: {
901: PetscScalar *Ascaled=work;
902: PetscReal nrm,alpha,beta,rwork[1];
903: PetscInt t;
904: PetscBLASInt i,j;
908: beta = PetscPowReal(coeff,1.0/(2*m+1));
909: for (i=0;i<n;i++)
910: for (j=0;j<n;j++)
911: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
912: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
913: PetscLogFlops(2.0*n*n);
914: alpha = normAm(n,Ascaled,2*m+1,work+n*n,rand)/nrm;
915: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
916: PetscFunctionReturn(t);
917: }
919: /*
920: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
921: */
922: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
923: {
924: PetscErrorCode ierr;
925: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
926: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
927: PetscBLASInt n_,n2,one=1;
928: PetscRandom rand;
929: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
930: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
931: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
932: 2.539398330063230e-001, /* m = 5 */
933: 9.504178996162932e-001, /* m = 7 */
934: 2.097847961257068e+000, /* m = 9 */
935: 5.371920351148152e+000 }; /* m = 13 */
938: *s = 0;
939: *m = 13;
940: PetscBLASIntCast(n,&n_);
941: PetscRandomCreate(PETSC_COMM_SELF,&rand);
942: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
943: if (d4==0.0) { /* safeguard for the case A = 0 */
944: *m = 3;
945: goto done;
946: }
947: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
948: PetscLogFlops(2.0*n*n);
949: eta1 = PetscMax(d4,d6);
950: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
951: *m = 3;
952: goto done;
953: }
954: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
955: *m = 5;
956: goto done;
957: }
958: if (n<SMALLN) {
959: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
960: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
961: PetscLogFlops(2.0*n*n*n+1.0*n*n);
962: } else {
963: d8 = PetscPowReal(normAm(n_,Apowers[2],2,work,rand),1.0/8.0);
964: }
965: eta3 = PetscMax(d6,d8);
966: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
967: *m = 7;
968: goto done;
969: }
970: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
971: *m = 9;
972: goto done;
973: }
974: if (n<SMALLN) {
975: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
976: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
977: PetscLogFlops(2.0*n*n*n+1.0*n*n);
978: } else {
979: d10 = PetscPowReal(normAm(n_,Apowers[1],5,work,rand),1.0/10.0);
980: }
981: eta4 = PetscMax(d8,d10);
982: eta5 = PetscMin(eta3,eta4);
983: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
984: if (*s) {
985: Ascaled = work+3*n*n;
986: n2 = n_*n_;
987: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
988: sfactor = PetscPowRealInt(2.0,-(*s));
989: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
990: PetscLogFlops(1.0*n*n);
991: } else Ascaled = A;
992: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
993: done:
994: PetscRandomDestroy(&rand);
995: return(0);
996: }
998: /*
999: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1000: *
1001: * N. J. Higham, "The scaling and squaring method for the matrix exponential
1002: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1003: */
1004: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
1005: {
1006: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
1008: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
1009: #else
1010: PetscErrorCode ierr;
1011: PetscBLASInt n_,n2,*ipiv,info,one=1;
1012: PetscInt n,m,j,s;
1013: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1014: PetscScalar *Aa,*Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
1015: const PetscScalar *c;
1016: const PetscScalar c3[4] = { 120, 60, 12, 1 };
1017: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
1018: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1019: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1020: 2162160, 110880, 3960, 90, 1 };
1021: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1022: 1187353796428800, 129060195264000, 10559470521600,
1023: 670442572800, 33522128640, 1323241920,
1024: 40840800, 960960, 16380, 182, 1 };
1027: MatDenseGetArray(A,&Aa);
1028: MatDenseGetArray(B,&Ba);
1029: MatGetSize(A,&n,NULL);
1030: PetscBLASIntCast(n,&n_);
1031: n2 = n_*n_;
1032: PetscMalloc2(8*n*n,&work,n,&ipiv);
1034: /* Matrix powers */
1035: Apowers[0] = work; /* Apowers[0] = A */
1036: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
1037: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
1038: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
1039: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
1041: PetscMemcpy(Apowers[0],Aa,n2*sizeof(PetscScalar));
1042: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
1043: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
1044: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
1045: PetscLogFlops(6.0*n*n*n);
1047: /* Compute scaling parameter and order of Pade approximant */
1048: expm_params(n,Apowers,&s,&m,Apowers[4]);
1050: if (s) { /* rescale */
1051: for (j=0;j<4;j++) {
1052: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1053: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
1054: }
1055: PetscLogFlops(4.0*n*n);
1056: }
1058: /* Evaluate the Pade approximant */
1059: switch (m) {
1060: case 3: c = c3; break;
1061: case 5: c = c5; break;
1062: case 7: c = c7; break;
1063: case 9: c = c9; break;
1064: case 13: c = c13; break;
1065: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1066: }
1067: P = Ba;
1068: Q = Apowers[4] + n*n;
1069: W = Q + n*n;
1070: switch (m) {
1071: case 3:
1072: case 5:
1073: case 7:
1074: case 9:
1075: if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
1076: PetscMemzero(P,n2*sizeof(PetscScalar));
1077: PetscMemzero(Q,n2*sizeof(PetscScalar));
1078: for (j=0;j<n;j++) {
1079: P[j+j*n] = c[1];
1080: Q[j+j*n] = c[0];
1081: }
1082: for (j=m;j>=3;j-=2) {
1083: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
1084: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
1085: PetscLogFlops(4.0*n*n);
1086: }
1087: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
1088: PetscLogFlops(2.0*n*n*n);
1089: SWAP(P,W,aux);
1090: break;
1091: case 13:
1092: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1093: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
1094: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
1095: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
1096: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
1097: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
1098: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
1099: PetscLogFlops(5.0*n*n+2.0*n*n*n);
1100: PetscMemzero(P,n2*sizeof(PetscScalar));
1101: for (j=0;j<n;j++) P[j+j*n] = c[1];
1102: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
1103: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
1104: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
1105: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
1106: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
1107: PetscLogFlops(7.0*n*n+2.0*n*n*n);
1108: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1109: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
1110: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
1111: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
1112: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
1113: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
1114: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
1115: PetscLogFlops(5.0*n*n+2.0*n*n*n);
1116: PetscMemzero(Q,n2*sizeof(PetscScalar));
1117: for (j=0;j<n;j++) Q[j+j*n] = c[0];
1118: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
1119: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
1120: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
1121: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
1122: PetscLogFlops(7.0*n*n);
1123: break;
1124: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1125: }
1126: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
1127: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
1128: SlepcCheckLapackInfo("gesv",info);
1129: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
1130: for (j=0;j<n;j++) P[j+j*n] += 1.0;
1131: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);
1133: /* Squaring */
1134: for (j=1;j<=s;j++) {
1135: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
1136: SWAP(P,W,aux);
1137: }
1138: if (P!=Ba) { PetscMemcpy(Ba,P,n2*sizeof(PetscScalar)); }
1139: PetscLogFlops(2.0*n*n*n*s);
1141: PetscFree2(work,ipiv);
1142: MatDenseRestoreArray(A,&Aa);
1143: MatDenseRestoreArray(B,&Ba);
1144: return(0);
1145: #endif
1146: }
1148: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1149: {
1151: PetscBool isascii;
1152: char str[50];
1153: const char *methodname[] = {
1154: "scaling & squaring, [m/m] Pade approximant (Higham)",
1155: "scaling & squaring, [6/6] Pade approximant",
1156: "scaling & squaring, subdiagonal Pade approximant"
1157: };
1158: const int nmeth=sizeof(methodname)/sizeof(methodname[0]);
1161: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1162: if (isascii) {
1163: if (fn->beta==(PetscScalar)1.0) {
1164: if (fn->alpha==(PetscScalar)1.0) {
1165: PetscViewerASCIIPrintf(viewer," Exponential: exp(x)\n");
1166: } else {
1167: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
1168: PetscViewerASCIIPrintf(viewer," Exponential: exp(%s*x)\n",str);
1169: }
1170: } else {
1171: SlepcSNPrintfScalar(str,50,fn->beta,PETSC_TRUE);
1172: if (fn->alpha==(PetscScalar)1.0) {
1173: PetscViewerASCIIPrintf(viewer," Exponential: %s*exp(x)\n",str);
1174: } else {
1175: PetscViewerASCIIPrintf(viewer," Exponential: %s",str);
1176: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1177: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
1178: PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1179: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1180: }
1181: }
1182: if (fn->method<nmeth) {
1183: PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]);
1184: }
1185: }
1186: return(0);
1187: }
1189: PETSC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1190: {
1192: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
1193: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1194: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1195: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1196: fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa;
1197: fn->ops->view = FNView_Exp;
1198: return(0);
1199: }