Context Navigation

wannier.c@ f70c2a

Visit:

Last change on this file since f70c2a was e08f45, checked in by Frederik Heber <heber@…>, 17 years ago

Merge branch 'ConcaveHull' of ../espack2 into ConcaveHull

Conflicts:

molecuilder/src/boundary.cpp
molecuilder/src/boundary.hpp
molecuilder/src/builder.cpp
molecuilder/src/linkedcell.cpp
molecuilder/src/linkedcell.hpp
molecuilder/src/vector.cpp
molecuilder/src/vector.hpp
util/src/NanoCreator.c

Basically, this resulted from a lot of conversions two from spaces to one tab, which is my standard indentation. The mess was caused by eclipse auto-indenting. And in espack2:ConcaveHull was the new stuff, so all from ConcaveHull was replaced in case of doubt.
Additionally, vector had ofstream << operator instead ostream << ...

Property mode set to 100755

File size: 99.2 KB

Line
1	/** \file wannier.c
2	* Maximally Localized Wannier Functions.
3	*
4	* Contains the on function that minimises the spread of all orbitals in one rush in a parallel
5	* Jacobi-Diagonalization implementation, ComputeMLWF(), and one routine CalculateSpread() to
6	* calculate the spread of a specific orbital, which may be useful in checking on the change of
7	* spread during other calculations. convertComplex() helps in typecasting fftw_complex to gsl_complex.
8	*
9	Project: ParallelCarParrinello
10	\author Frederik Heber
11	\date 2006
12
13	File: wannier.c
14	$Id: wannier.c,v 1.7 2007-10-12 15:50:38 heber Exp $
15	*/
16
17	#ifdef HAVE_CONFIG_H
18	#include <config.h>
19	#endif
20
21	#include <math.h>
22	#include <gsl/gsl_math.h>
23	#include <gsl/gsl_eigen.h>
24	#include <gsl/gsl_matrix.h>
25	#include <gsl/gsl_vector.h>
26	#include <gsl/gsl_complex.h>
27	#include <gsl/gsl_complex_math.h>
28	#include <gsl/gsl_sort_vector.h>
29	#include <gsl/gsl_heapsort.h>
30	#include <gsl/gsl_blas.h>
31	#include <string.h>
32
33	#include "data.h"
34	#include "density.h"
35	#include "errors.h"
36	#include "gramsch.h"
37	#include "helpers.h"
38	#include "init.h"
39	#include "myfft.h"
40	#include "mymath.h"
41	#include "output.h"
42	#include "perturbed.h"
43	#include "wannier.h"
44
45
46	#define max_operators NDIM*2 //!< number of chosen self-adjoint operators when evaluating the spread
47	#define type Occupied
48
49
50	/** Converts type fftw_complex to gsl_complex.
51	* \param a complex number
52	* \return b complex number
53	*/
54	gsl_complex convertComplex (fftw_complex a) {
55	return gsl_complex_rect(c_re(a),c_im(a));
56	}
57
58	/** "merry go round" implementation for parallel index ordering.
59	* Given two arrays, one for the upper/left matrix columns, one for the lower/right ones, one step of an index generation is
60	* performed which generates once each possible pairing.
61	* \param *top index array 1
62	* \param *bot index array 2
63	* \param m N/2, where N is the matrix row/column dimension
64	* \note taken from [Golub, Matrix computations, 1989, p451]
65	*/
66	void MerryGoRoundIndices(int top, int bot, int m)
67	{
68	int old_top, old_bot;
69	int k;
70	old_top = (int ) Malloc(sizeof(int)m, "music: old_top");
71	old_bot = (int ) Malloc(sizeof(int)m, "music: old_bot");
72	/* fprintf(stderr,"oldtop\t");
73	for (k=0;k<m;k++)
74	fprintf(stderr,"%i\t", top[k]);
75	fprintf(stderr,"\n");
76	fprintf(stderr,"oldbot\t");
77	for (k=0;k<m;k++)
78	fprintf(stderr,"%i\t", bot[k]);
79	fprintf(stderr,"\n");*/
80	// first copy arrays
81	for (k=0;k<m;k++) {
82	old_top[k] = top[k];
83	old_bot[k] = bot[k];
84	}
85	// then let the music play
86	for (k=0;k<m;k++) {
87	if (k==1)
88	top[k] = old_bot[0];
89	else if (k > 1)
90	top[k] = old_top[k-1];
91	if (k==m-1)
92	bot[k] = old_top[k];
93	else
94	bot[k] = old_bot[k+1];
95	}
96	/* fprintf(stderr,"top\t");
97	for (k=0;k<m;k++)
98	fprintf(stderr,"%i\t", top[k]);
99	fprintf(stderr,"\n");
100	fprintf(stderr,"bot\t");
101	for (k=0;k<m;k++)
102	fprintf(stderr,"%i\t", bot[k]);
103	fprintf(stderr,"\n");*/
104	// and finito
105	Free(old_top, "MerryGoRoundIndices: old_top");
106	Free(old_bot, "MerryGoRoundIndices: old_bot");
107	}
108
109	/** merry-go-round for matrix columns.
110	* The trick here is that we must be aware of multiple rotations per process, thus only some of the
111	* whole lot of local columns get sent/received, most of them are just shifted via exchanging the various
112	* pointers to the matrix columns within the local array.
113	* \param comm communicator for circulation
114	* \param *Aloc local array of columns
115	* \param Num entries per column
116	* \param max_rounds number of column pairs in \a *Around
117	* \param k offset for tag
118	* \param tagS0 MPI tag for sending left column
119	* \param tagS1 MPI tag for sending right column
120	* \param tagR0 MPI tag for receiving left column
121	* \param tagR1 MPI tag for receiving right column
122	*/
123	void MerryGoRoundColumns(MPI_Comm comm, double **Aloc, int Num, int max_rounds, int k, int tagS0, int tagS1, int tagR0, int tagR1) {
124	//double A_locS1, A_locS2; // local columns of A[k]
125	//double A_locR1, A_locR2; // local columns of A[k]
126	MPI_Request requestS0, requestS1, requestR0, requestR1;
127	MPI_Status status;
128	int ProcRank, ProcNum;
129	int l;
130	MPI_Comm_size (comm, &ProcNum);
131	MPI_Comm_rank (comm, &ProcRank);
132	double Abuffer1, Abuffer2; // mark the columns that are circulated
133
134	//fprintf(stderr,"shifting...");
135	if (ProcRank == 0) {
136	if (max_rounds > 1) {
137	// get last left column
138	Abuffer1 = Aloc[2*(max_rounds-1)]; // note down the free column
139	MPI_Isend(Abuffer1, Num, MPI_DOUBLE, ProcRank+1, WannierALTag+2*k, comm, &requestS0);
140	} else {
141	// get right column
142	Abuffer1 = Aloc[1]; // note down the free column
143	MPI_Isend(Abuffer1, Num, MPI_DOUBLE, ProcRank+1, tagS1+2*k, comm, &requestS0);
144	}
145
146	//fprintf(stderr,"...left columns...");
147	for(l=2*max_rounds-2;l>2;l-=2) // left columns become shifted one place to the right
148	Aloc[l] = Aloc[l-2];
149
150	if (max_rounds > 1) {
151	//fprintf(stderr,"...first right...");
152	Aloc[2] = Aloc[1]; // get first right column
153	}
154
155	//fprintf(stderr,"...right columns...");
156	for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
157	Aloc[l] = Aloc[l+2];
158
159	//fprintf(stderr,"...last right...");
160	Aloc[(2*max_rounds-1)] = Abuffer1;
161	MPI_Irecv(Abuffer1, Num, MPI_DOUBLE, ProcRank+1, WannierARTag+2*k, comm, &requestR1);
162
163	} else if (ProcRank == ProcNum-1) {
164	//fprintf(stderr,"...first right...");
165	// get first right column
166	Abuffer2 = Aloc[1]; // note down the free column
167	MPI_Isend(Abuffer2, Num, MPI_DOUBLE, ProcRank-1, WannierARTag+2*k, comm, &requestS1);
168
169	//fprintf(stderr,"...right columns...");
170	for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
171	Aloc[(l)] = Aloc[(l+2)];
172
173	//fprintf(stderr,"...last right...");
174	Aloc[(2max_rounds-1)] = Aloc[2(max_rounds-1)]; // Put last left into last right column
175
176	//fprintf(stderr,"...left columns...");
177	for(l=2*(max_rounds-1);l>0;l-=2) // left columns become shifted one place to the right
178	Aloc[(l)] = Aloc[(l-2)];
179
180	//fprintf(stderr,"...first left...");
181	// if (max_rounds > 1)
182	Aloc[0] = Abuffer2; // get first left column
183	MPI_Irecv(Abuffer2, Num, MPI_DOUBLE, ProcRank-1, WannierALTag+2*k, comm, &requestR0);
184
185	} else {
186	// get last left column
187	MPI_Isend(Aloc[2(max_rounds-1)], Num, MPI_DOUBLE, ProcRank+1, WannierALTag+2k, comm, &requestS0);
188	Abuffer1 = Aloc[2*(max_rounds-1)]; // note down the free column
189
190	//fprintf(stderr,"...first right...");
191	// get first right column
192	MPI_Isend(Aloc[1], Num, MPI_DOUBLE, ProcRank-1, WannierARTag+2*k, comm, &requestS1);
193	Abuffer2 = Aloc[1]; // note down the free column
194
195	//fprintf(stderr,"...left columns...");
196	for(l=2*(max_rounds-1);l>0;l-=2) // left columns become shifted one place to the right
197	Aloc[(l)] = Aloc[(l-2)];
198
199	//fprintf(stderr,"...right columns...");
200	for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
201	Aloc[(l)] = Aloc[(l+2)];
202
203	//fprintf(stderr,"...first left...");
204	Aloc[0] = Abuffer1; // get first left column
205	MPI_Irecv(Aloc[0], Num, MPI_DOUBLE, ProcRank-1, WannierALTag+2*k, comm, &requestR0);
206
207	//fprintf(stderr,"...last right...");
208	Aloc[(2*max_rounds-1)] = Abuffer2;
209	MPI_Irecv(Aloc[(2max_rounds-1)], Num, MPI_DOUBLE, ProcRank+1, WannierARTag+2k, comm, &requestR1);
210	}
211
212	//fprintf(stderr,"...waiting...");
213	if (ProcRank != ProcNum-1)
214	MPI_Wait(&requestS0, &status);
215	if (ProcRank != 0) // first left column
216	MPI_Wait(&requestR0, &status);
217	if (ProcRank != 0)
218	MPI_Wait(&requestS1, &status);
219	if (ProcRank != ProcNum-1)
220	MPI_Wait(&requestR1, &status);
221	//fprintf(stderr,"...done\n");
222	}
223
224	/** By testing of greatest common divisor of the matrix rows (\a AllocNum) finds suitable parallel cpu group.
225	* \param *P Problem at hand
226	* \param AllocNum number of rows in matrix
227	* \return address of MPI communicator
228	*/
229	#ifdef HAVE_INLINE
230	inline MPI_Comm * DetermineParallelGroupbyGCD (struct Problem *P, int AllocNum)
231	#else
232	MPI_Comm * DetermineParallelGroupbyGCD (struct Problem *P, int AllocNum)
233	#endif
234	{
235	MPI_Comm *comm = &P->Par.comm_ST;
236
237	//if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) Comparing groups - AllocNum %i --- All %i\t Psi %i\t PsiT %i\n",P->Par.me, AllocNum, P->Par.Max_me_comm_ST, P->Par.Max_me_comm_ST_Psi, P->Par.Max_my_color_comm_ST_Psi);
238	//if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) Jacobi diagonalization is done parallely by ", P->Par.me);
239	if (AllocNum % (P->Par.Max_me_comm_ST*2) == 0) { // all parallel
240	comm = &P->Par.comm_ST;
241	//if (P->Call.out[ReadOut]) fprintf(stderr,"all\n");
242	} else if (P->Par.Max_me_comm_ST_Psi >= P->Par.Max_my_color_comm_ST_Psi) { // always the bigger group comes first
243	if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) { // coefficients parallel
244	comm = &P->Par.comm_ST_Psi;
245	//if (P->Call.out[ReadOut]) fprintf(stderr,"Psi\n");
246	} else if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) { // Psis parallel
247	comm = &P->Par.comm_ST_PsiT;
248	//if (P->Call.out[ReadOut]) fprintf(stderr,"PsiT\n");
249	}
250	} else {
251	if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) { // Psis parallel
252	comm = &P->Par.comm_ST_PsiT;
253	//if (P->Call.out[ReadOut]) fprintf(stderr,"PsiT\n");
254	} else if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) { // coefficients parallel
255	comm = &P->Par.comm_ST_Psi;
256	//if (P->Call.out[ReadOut]) fprintf(stderr,"Psi\n");
257	}
258	}
259	return comm;
260	}
261
262	/** Allocates and fills Lookup table for sin/cos values at each grid node.
263	* \param ***cos_table pointer to two-dimensional lookup table for cosine values
264	* \param ***sin_table pointer to two-dimensional lookup table for sine values
265	* \param *N array with number of nodes per \a NDIM axis
266	*/
267	void CreateSinCosLookupTable(double *cos_table, double sin_table, int N)
268	{
269	int i, j;
270	double argument;
271	double cos_lookup, sin_lookup;
272
273	// create lookup table for sin/cos values
274	cos_lookup = (double *) Malloc(sizeof(double )NDIM, "ComputeMLWF: cos_lookup");
275	sin_lookup = (double *) Malloc(sizeof(double )NDIM, "ComputeMLWF: sin_lookup");
276	for (i=0;i<NDIM;i++) {
277	// allocate memory
278	cos_lookup[i] = (double ) Malloc(sizeof(double)N[i], "ComputeMLWF: cos_lookup");
279	sin_lookup[i] = (double ) Malloc(sizeof(double)N[i], "ComputeMLWF: sin_lookup");
280
281	// reset arrays
282	SetArrayToDouble0(cos_lookup[i],N[i]);
283	SetArrayToDouble0(sin_lookup[i],N[i]);
284
285	// create lookup values
286	for (j=0;j<N[i];j++) {
287	argument = 2PI/(double)N[i](double)j;
288	cos_lookup[i][j] = cos(argument);
289	sin_lookup[i][j] = sin(argument);
290	}
291	}
292	*cos_table = cos_lookup;
293	*sin_table = sin_lookup;
294	}
295
296	/** Frees memory allocated during CreateSinCosLookupTable().
297	* \param ***cos_lookup pointer to two-dimensional lookup table for cosine values
298	* \param ***sin_lookup pointer to two-dimensional lookup table for sine values
299	*/
300	void FreeSinCosLookupTable(double cos_lookup, double sin_lookup)
301	{
302	int i;
303	// free lookups
304	for (i=0;i<NDIM;i++) {
305	Free(cos_lookup[i], "FreeSinCosLookupTable: cos_lookup[i]");
306	Free(sin_lookup[i], "FreeSinCosLookupTable: sin_lookup[i]");
307	}
308	Free(cos_lookup, "FreeSinCosLookupTable: cos_lookup");
309	Free(sin_lookup, "FreeSinCosLookupTable: sin_lookup");
310	}
311
312	/** Fills the entries of the six variance matrices.
313	* These matrices are parallely diagonalized during Wannier Localization. They are calculated from the
314	* wave function and by diagonalization one obtains the unitary transformation with which the Psis are
315	* treated afterwards.
316	* \param *P Problem at hand
317	* \param AllocNum number of rows/columns
318	* \param **A pointer to variance matrices
319	* \sa ComputeMLWF() - master function.
320	*/
321	void FillHigherOrderRealMomentsMatrices(struct Problem P, int AllocNum, gsl_matrix *A)
322	{
323	struct Lattice *Lat = &P->Lat;
324	struct RunStruct *R = &P->R;
325	struct Psis *Psi = &Lat->Psi;
326	struct LatticeLevel *Lev0 = R->Lev0;
327	struct LatticeLevel *LevS = R->LevS;
328	struct Density *Dens0 = Lev0->Dens;
329	struct OnePsiElement OnePsiA, OnePsiB, *LOnePsiB;
330	struct fft_plan_3d *plan = Lat->plan;
331	fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
332	fftw_real PsiCR = (fftw_real )PsiC;
333	fftw_complex *work = Dens0->DensityCArray[Temp2Density];
334	fftw_real **HGcR = &Dens0->DensityArray[HGDensity]; // use HGDensity, 4x Gap..Density, TempDensity as a storage array
335	fftw_complex HGcRC = (fftw_complex)HGcR;
336	fftw_complex **HGcR2C = &Dens0->DensityCArray[HGcDensity]; // use HGcDensity, 4x Gap..Density, TempDensity as an array
337	fftw_real HGcR2 = (fftw_real)HGcR2C;
338	int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
339	MPI_Status status;
340	fftw_complex LPsiDatA=NULL, LPsiDatB=NULL;
341	int n[NDIM],n0,i0,iS, Index;
342	int Num = Psi->NoOfPsis; // is number of occupied plus unoccupied states for rows
343	int N0 = LevS->Plan0.plan->local_nx;
344	int *N = LevS->Plan0.plan->N;
345	const int NUpx = LevS->NUp[0];
346	const int NUpy = LevS->NUp[1];
347	const int NUpz = LevS->NUp[2];
348	double argument, PsiAtNode;
349	int e,g,i,j,k,l,m,p,u;
350	double a_ij = 0, b_ij = 0, A_ij = 0, B_ij = 0;
351	double cos_lookup = NULL,sin_lookup = NULL;
352
353	if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 2\n",P->Par.me);
354
355	debug(P, "Creating Lookup Table");
356	CreateSinCosLookupTable(&cos_lookup, &sin_lookup, N);
357
358	debug(P, "Calculating each entry of variance matrices");
359	l=-1; // to access U matrix element (0..Num-1)
360	// fill the matrices
361	for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) { // go through all wave functions
362	OnePsiA = &Psi->AllPsiStatus[i]; // grab OnePsiA
363	if (OnePsiA->PsiType == type) { // drop all but occupied ones
364	l++; // increase l if it is non-extra wave function
365	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
366	LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
367	else
368	LPsiDatA = NULL; // otherwise processes won't enter second loop, though they're supposed to send coefficients!
369
370	//fprintf(stderr,"(%i),(%i,%i): fft'd, A[..] and B, back-fft'd acting on \\phi_A\n",P->Par.me,l,0);
371	if (LPsiDatA != NULL) {
372	CalculateOneDensityR(Lat, LevS, Dens0, LPsiDatA, Dens0->DensityArray[ActualDensity], R->FactorDensityR, 1);
373	// note: factor is not used when storing result in DensityCArray[ActualPsiDensity] in CalculateOneDensityR()!
374	for (n0=0;n0<N0;n0++)
375	for (n[1]=0;n[1]<N[1];n[1]++)
376	for (n[2]=0;n[2]<N[2];n[2]++) {
377	i0 = n[2]NUpz+N[2]NUpz(n[1]NUpy+N[1]NUpyn0*NUpx);
378	iS = n[2]+N[2](n[1]+N[1]n0);
379	n[0] = n0 + LevS->Plan0.plan->start_nx;
380	for (k=0;k<max_operators;k+=2) {
381	e = k/2;
382	argument = 2.PI/(double)(N[e])(double)(n[e]);
383	PsiAtNode = PsiCR[i0] /LevS->MaxN;
384	// check lookup
385	if (!l) // perform check on first wave function only
386	if ((fabs(cos(argument) - cos_lookup[e][n[e]]) > MYEPSILON) \|\| (fabs(sin(argument) - sin_lookup[e][n[e]]) > MYEPSILON)) {
387	Error(SomeError, "Lookup table does not match real value!");
388	}
389	HGcR[k][iS] = cos_lookup[e][n[e]] * PsiAtNode; /* Matrix Vector Mult */
390	HGcR2[k][iS] = cos_lookup[e][n[e]] * HGcR[k][iS]; /* Matrix Vector Mult */
391	HGcR[k+1][iS] = sin_lookup[e][n[e]] * PsiAtNode; /* Matrix Vector Mult */
392	HGcR2[k+1][iS] = sin_lookup[e][n[e]] * HGcR[k+1][iS]; /* Matrix Vector Mult */
393	}
394	}
395	for (u=0;u<max_operators;u++) {
396	fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC[u], work);
397	fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcR2C[u], work);
398	}
399	}
400	m = -1; // to access U matrix element (0..Num-1)
401	for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) { // go through all wave functions
402	OnePsiB = &Psi->AllPsiStatus[j]; // grab OnePsiB
403	if (OnePsiB->PsiType == type) { // drop all but occupied ones
404	m++; // increase m if it is non-extra wave function
405	if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
406	LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
407	else
408	LOnePsiB = NULL;
409	if (LOnePsiB == NULL) { // if it's not local ... receive it from respective process into TempPsi
410	RecvSource = OnePsiB->my_color_comm_ST_Psi;
411	MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
412	LPsiDatB=LevS->LPsi->TempPsi;
413	} else { // .. otherwise send it to all other processes (Max_me... - 1)
414	for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
415	if (p != OnePsiB->my_color_comm_ST_Psi)
416	MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
417	LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
418	} // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
419
420	for (u=0;u<max_operators;u++) {
421	a_ij = 0;
422	b_ij = 0;
423	if (LPsiDatA != NULL) { // calculate, reduce and send to all ...
424	//fprintf(stderr,"(%i),(%i,%i): A[%i]: multiplying with \\phi_B\n",P->Par.me, l,m,u);
425	g=0;
426	if (LevS->GArray[0].GSq == 0.0) {
427	Index = LevS->GArray[g].Index;
428	a_ij = (LPsiDatB[0].reHGcRC[u][Index].re + LPsiDatB[0].imHGcRC[u][Index].im);
429	b_ij = (LPsiDatB[0].reHGcR2C[u][Index].re + LPsiDatB[0].imHGcR2C[u][Index].im);
430	g++;
431	}
432	for (; g < LevS->MaxG; g++) {
433	Index = LevS->GArray[g].Index;
434	a_ij += 2(LPsiDatB[g].reHGcRC[u][Index].re + LPsiDatB[g].im*HGcRC[u][Index].im);
435	b_ij += 2(LPsiDatB[g].reHGcR2C[u][Index].re + LPsiDatB[g].im*HGcR2C[u][Index].im);
436	} // due to the symmetry the resulting matrix element is real and symmetric in (i,j) ! (complex multiplication simplifies ...)
437	// sum up elements from all coefficients sharing processes
438	MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
439	MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
440	a_ij = A_ij;
441	b_ij = B_ij;
442	// send element to all Psi-sharing who don't have l local (MPI_Send is a lot slower than AllReduce!)
443	MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
444	MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
445	} else { // receive ...
446	MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
447	MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
448	}
449	// ... and store
450	//fprintf(stderr,"(%i),(%i,%i): A[%i]: setting component (local: %lg, total: %lg)\n",P->Par.me, l,m,u,a_ij,A_ij);
451	//fprintf(stderr,"(%i),(%i,%i): B: adding upon component (local: %lg, total: %lg)\n",P->Par.me, l,m,b_ij,B_ij);
452	gsl_matrix_set(A[u], l, m, A_ij);
453	gsl_matrix_set(A[max_operators], l, m, B_ij + gsl_matrix_get(A[max_operators],l,m));
454	}
455	}
456	}
457	}
458	}
459	// reset extra entries
460	for (u=0;u<=max_operators;u++) {
461	for (i=Num;i<AllocNum;i++)
462	for (j=0;j<AllocNum;j++)
463	gsl_matrix_set(A[u], i,j, 0.);
464	for (i=Num;i<AllocNum;i++)
465	for (j=0;j<AllocNum;j++)
466	gsl_matrix_set(A[u], j,i, 0.);
467	}
468	FreeSinCosLookupTable(cos_lookup, sin_lookup);
469
470	/*// print A matrices for debug
471	if (P->Par.me == 0)
472	for (u=0;u<max_operators+1;u++) {
473	fprintf(stderr, "A[%i] = \n",u);
474	for (i=0;i<Num;i++) {
475	for (j=0;j<Num;j++)
476	fprintf(stderr, "%e\t",gsl_matrix_get(A[u],i,j));
477	fprintf(stderr, "\n");
478	}
479	}
480	*/
481	}
482
483	/** Calculates reciprocal second order moments of each PsiType#Occupied orbital.
484	* First order is zero due to wave function being real (symmetry condition).
485	* \param *P Problem at hand
486	*/
487	void CalculateSecondOrderReciprocalMoment(struct Problem *P)
488	{
489	struct Lattice *Lat = &P->Lat;
490	struct RunStruct *R = &P->R;
491	struct FileData *F = &P->Files;
492	struct Psis *Psi = &Lat->Psi;
493	struct LatticeLevel *LevS = R->LevS;
494	double result, Result;
495	fftw_complex *LPsiDatA=NULL;
496	struct OnePsiElement *OnePsiA;
497	int i,j,l,g;
498	char spin[12], suffix[18];
499
500	switch (Lat->Psi.PsiST) {
501	case SpinDouble:
502	strcpy(suffix,".recispread.csv");
503	strcpy(spin,"SpinDouble");
504	break;
505	case SpinUp:
506	strcpy(suffix,".recispread_up.csv");
507	strcpy(spin,"SpinUp");
508	break;
509	case SpinDown:
510	strcpy(suffix,".recispread_down.csv");
511	strcpy(spin,"SpinDown");
512	break;
513	}
514	if(P->Par.me_comm_ST == 0) {
515	if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Calculating reciprocal moments ...\n",P->Par.me);
516	if (R->LevSNo == Lat->MaxLevel-1) // open freshly if first level
517	OpenFile(P, &F->ReciSpreadFile, suffix, "w", P->Call.out[ReadOut]); // only open on starting level
518	else if (F->ReciSpreadFile == NULL) // re-op,18en if not first level and not opened yet (or closed from ParseWannierFile)
519	OpenFile(P, &F->ReciSpreadFile, suffix, "a", P->Call.out[ReadOut]); // only open on starting level
520	if (F->ReciSpreadFile == NULL) {
521	Error(SomeError,"ComputeMLWF: Error opening Reciprocal spread File!\n");
522	} else {
523	fprintf(F->ReciSpreadFile,"===== Reciprocal Spreads of type %s ==========================================================================\n", spin);
524	}
525	}
526
527	// integrate second order moment
528	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
529	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
530	if (OnePsiA->PsiType == type) { // drop all but occupied ones
531	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
532	LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
533	else
534	LPsiDatA = NULL;
535
536	if (LPsiDatA != NULL) {
537	if (P->Par.me_comm_ST == 0)
538	fprintf(F->ReciSpreadFile,"Psi%d_Lev%d\t", Psi->AllPsiStatus[l].MyGlobalNo, R->LevSNo);
539	for (i=0;i<NDIM;i++) {
540	for (j=0;j<NDIM;j++) {
541	result = 0.;
542	g = 0;
543	if (LevS->GArray[0].GSq == 0.0) {
544	result += LevS->GArray[g].G[i]LevS->GArray[g].G[j](LPsiDatA[0].re * LPsiDatA[0].re);
545	g++;
546	}
547	for (;g<LevS->MaxG;g++)
548	result += LevS->GArray[g].G[i]LevS->GArray[g].G[j]2.(LPsiDatA[g].re LPsiDatA[g].re + LPsiDatA[g].im * LPsiDatA[g].im);
549	//result *= Lat->Volume/LevS->MaxG;
550	MPI_Allreduce ( &result, &Result, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
551	if (P->Par.me_comm_ST == 0)
552	fprintf(F->ReciSpreadFile,"%2.5lg ", Result);
553	}
554	}
555	if (P->Par.me_comm_ST == 0)
556	fprintf(F->ReciSpreadFile,"\n");
557	}
558	}
559	}
560	if(P->Par.me_comm_ST == 0) {
561	fprintf(F->ReciSpreadFile,"====================================================================================================================\n\n");
562	fflush(F->ReciSpreadFile);
563	}
564	}
565
566	/** Given the unitary matrix the transformation is performed on the Psis.
567	* \param *P Problem at hand
568	* \param *U gsl matrix containing the transformation matrix
569	* \param Num dimension parameter for the matrix, i.e. number of wave functions
570	*/
571	void UnitaryTransformationOnWavefunctions(struct Problem P, gsl_matrix U, int Num)
572	{
573	struct Lattice *Lat = &P->Lat;
574	struct RunStruct *R = &P->R;
575	struct Psis *Psi = &Lat->Psi;
576	struct LatticeLevel *LevS = R->LevS;
577	MPI_Status status;
578	struct OnePsiElement OnePsiB, OnePsiA, *LOnePsiB;
579	int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
580	fftw_complex LPsiDatA=NULL, LPsiDatB=NULL;
581	int g,i,j,l,k,m,p;
582
583	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation of all wave functions according to U\n",P->Par.me);
584
585	Num = Psi->TypeStartIndex[type+1] - Psi->TypeStartIndex[type]; // recalc Num as we can only work with local Psis from here
586	fftw_complex *coeffs_buffer = Malloc(sizeof(fftw_complex )Num, "ComputeMLWF: *coeffs_buffer");
587
588	for (l=0;l<Num;l++) // allocate for each local wave function
589	coeffs_buffer[l] = LevS->LPsi->OldLocalPsi[l];
590
591	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation ...\n",P->Par.me);
592	l=-1; // to access U matrix element (0..Num-1)
593	k=-1; // to access the above swap coeffs_buffer (0..LocalNo-1)
594	for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) { // go through all wave functions
595	OnePsiA = &Psi->AllPsiStatus[i]; // grab OnePsiA
596	if (OnePsiA->PsiType == type) { // drop all but occupied ones
597	l++; // increase l if it is occupied wave function
598	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) { // local?
599	k++; // increase k only if it is a local, non-extra orbital wave function
600	LPsiDatA = (fftw_complex *) coeffs_buffer[k]; // new coeffs first go to copy buffer, old ones must not be overwritten yet
601	SetArrayToDouble0((double )LPsiDatA, 2LevS->MaxG); // zero buffer part
602	} else
603	LPsiDatA = NULL; // otherwise processes won't enter second loop, though they're supposed to send coefficients!
604
605	m = -1; // to access U matrix element (0..Num-1)
606	for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) { // go through all wave functions
607	OnePsiB = &Psi->AllPsiStatus[j]; // grab OnePsiB
608	if (OnePsiB->PsiType == type) { // drop all but occupied ones
609	m++; // increase m if it is occupied wave function
610	if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
611	LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
612	else
613	LOnePsiB = NULL;
614	if (LOnePsiB == NULL) { // if it's not local ... receive it from respective process into TempPsi
615	RecvSource = OnePsiB->my_color_comm_ST_Psi;
616	MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
617	LPsiDatB=LevS->LPsi->TempPsi;
618	} else { // .. otherwise send it to all other processes (Max_me... - 1)
619	for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
620	if (p != OnePsiB->my_color_comm_ST_Psi)
621	MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
622	LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
623	} // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
624
625	if (LPsiDatA != NULL) {
626	double tmp = gsl_matrix_get(U,l,m);
627	g=0;
628	if (LevS->GArray[0].GSq == 0.0) {
629	LPsiDatA[g].re += LPsiDatB[g].re * tmp;
630	LPsiDatA[g].im += LPsiDatB[g].im * tmp;
631	g++;
632	}
633	for (; g < LevS->MaxG; g++) {
634	LPsiDatA[g].re += LPsiDatB[g].re * tmp;
635	LPsiDatA[g].im += LPsiDatB[g].im * tmp;
636	}
637	}
638	}
639	}
640	}
641	}
642
643	//if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) STEP 6: Swapping buffer mem\n",P->Par.me);
644	// now, as all wave functions are updated, swap the buffer
645	l = -1;
646	for (k=0;k<Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT;k++) { // go through each local occupied wave function
647	if (Psi->AllPsiStatus[k].PsiType == type && Psi->AllPsiStatus[k].my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {
648	l++;
649	//if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) (k:%i,l:%i) LocalNo = (%i,%i)\t AllPsiNo = (%i,%i)\n", P->Par.me, k,l,Psi->LocalPsiStatus[l].MyLocalNo, Psi->LocalPsiStatus[l].MyGlobalNo, Psi->AllPsiStatus[k].MyLocalNo, Psi->AllPsiStatus[k].MyGlobalNo);
650	LPsiDatA = (fftw_complex *)coeffs_buffer[l];
651	LPsiDatB = LevS->LPsi->LocalPsi[l];
652	for (g=0;g<LevS->MaxG;g++) {
653	LPsiDatB[g].re = LPsiDatA[g].re;
654	LPsiDatB[g].im = LPsiDatA[g].im;
655	}
656	// recalculating non-local form factors which are coefficient dependent!
657	CalculateNonLocalEnergyNoRT(P, Psi->LocalPsiStatus[l].MyLocalNo);
658	}
659	}
660	// and free allocated buffer memory
661	Free(coeffs_buffer, "UnitaryTransformationOnWavefunctions: coeffs_buffer");
662	}
663
664	/** Changes Wannier Centres according to RunStruct#CommonWannier.
665	* \param *P Problem at hand.
666	*/
667	void ChangeWannierCentres(struct Problem *P)
668	{
669	struct RunStruct *R = &P->R;
670	struct Lattice *Lat = &P->Lat;
671	struct LatticeLevel *LevS = R->LevS;
672	struct Psis *Psi = &Lat->Psi;
673	struct OnePsiElement *OnePsiA;
674	int marker, *group;
675	int Num = Psi->NoOfPsis;
676	double **WannierCentre = NULL;
677	double *WannierSpread = NULL;
678	int partner[Num];
679	int i,j,l,k;
680	int totalflag, flag;
681	double q[NDIM], center[NDIM];
682	double Spread;
683	int *N = LevS->Plan0.plan->N;
684
685	int MPISignal = 0, mpisignal = 0;
686	int membersize, maxsize;
687	int msgsize;
688	double *buffer = NULL;
689	int position;
690	MPI_Status status;
691
692	// determine necessary pack/buffer sizes for the wannier stuff
693	MPI_Pack_size(3, MPI_DOUBLE, P->Par.comm_ST_PsiT, &membersize);
694	maxsize = membersize;
695	MPI_Pack_size(1, MPI_DOUBLE, P->Par.comm_ST_PsiT, &membersize);
696	maxsize += membersize;
697
698	// allocate memory
699	buffer = (double ) Malloc(sizeof(double)maxsize, "ChangeWannierCentres: *buffer");
700	WannierSpread = (double ) Malloc(sizeof(double)Num, "ChangeWannierCentres: *WannierSpread");
701	WannierCentre = (double *) Malloc(sizeof(double )Num, "ChangeWannierCentres: *WannierCentre");
702	for (j=0;j<Num;j++)
703	WannierCentre[j] = (double ) Malloc(sizeof(double)(NDIM+1), "ChangeWannierCentres: *WannierCentre[]");
704
705	// exchange wannier centers and spread among processes by going over all wave functions
706	i=-1;
707	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) {
708	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
709	if (OnePsiA->PsiType == type) { // drop all but occupied ones
710	i++; // increase l if it is occupied wave function
711
712	// send and receive centres and spread (we use the MPI_Pack'ing scheme to preserve correct order of data)
713	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {// is this local?
714	position = 0;
715	MPI_Pack(Psi->AddData[OnePsiA->MyLocalNo].WannierCentre, 3, MPI_DOUBLE, buffer, maxsize, &position, P->Par.comm_ST_PsiT);
716	MPI_Pack(&Psi->AddData[OnePsiA->MyLocalNo].WannierSpread, 1, MPI_DOUBLE, buffer, maxsize, &position, P->Par.comm_ST_PsiT);
717	for (k=0;k<P->Par.Max_me_comm_ST_PsiT;k++)
718	if (k != P->Par.my_color_comm_ST_Psi) { // send to all other processes
719	fprintf(stderr, "(%i) Send goes from %d to %d\n", P->Par.me, k, P->Par.my_color_comm_ST_Psi);
720	if (MPI_Send(buffer, position, MPI_PACKED, k, WannierTag5, P->Par.comm_ST_PsiT) != MPI_SUCCESS) {
721	mpisignal = 1;
722	}
723	} else { // "send" to us
724	for(j=0;j<NDIM;j++) // copy to own array as well
725	WannierCentre[i][j] = Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j];
726	WannierSpread[i] = Psi->AddData[OnePsiA->MyLocalNo].WannierSpread;
727	}
728	} else {
729	fprintf(stderr, "(%i) Receive is from %d to %d\n", P->Par.me, OnePsiA->my_color_comm_ST_Psi, P->Par.my_color_comm_ST_Psi);
730	if (MPI_Recv(buffer, maxsize, MPI_PACKED, OnePsiA->my_color_comm_ST_Psi, WannierTag5, P->Par.comm_ST_PsiT, &status) != MPI_SUCCESS)
731	mpisignal = 1;
732	position = 0;
733	MPI_Get_count(&status, MPI_PACKED, &msgsize);
734	MPI_Unpack (buffer, msgsize, &position, WannierCentre[i], 3, MPI_DOUBLE, P->Par.comm_ST_PsiT);
735	MPI_Unpack (buffer, msgsize, &position, &WannierSpread[i], 1, MPI_DOUBLE, P->Par.comm_ST_PsiT);
736	}
737
738	// check for MPI errors
739	if (MPI_Allreduce(&mpisignal, &MPISignal, 1, MPI_INT, MPI_SUM, P->Par.comm_ST_PsiT) != MPI_SUCCESS) // allreduce signals
740	Error(SomeError,"ChangeWannierCentres: Bcast of signal failed\n");
741	if (MPISignal) { // greater 0 means there was at least one hickup somewhere
742	Error(SomeError,"ChangeWannierCentres: Scattering of Wanniers failed\n");
743	}
744	}
745	}
746	// free the buffer memory
747	Free(buffer, "ChangeWannierCentres: *buffer");
748
749	if ((P->Call.out[NormalOut]) && (1)) { // root prints gathered wannier centres for debugging
750	fprintf(stderr, "(%i) Printing gathered Wannier centres:\n", P->Par.me);
751	for(l=0;l<Num;l++) {
752	fprintf(stderr, "(%i) ", P->Par.me);
753	for(j=0;j<NDIM;j++)
754	fprintf(stderr, "%lg\t", WannierCentre[l][j]);
755	fprintf(stderr, "-\t%lg\n", WannierSpread[l]);
756	}
757	}
758
759	// change centres
760	switch (R->CommonWannier) {
761	case 4:
762	debug(P,"Shifting each Wannier centers to cell center");
763	for (j=0;j<NDIM;j++) // center point in [0,1]^3
764	center[j] = 0.5;
765	RMat33Vec3(q,Lat->RealBasis, center); // transform to real coordinates
766	for (i=0; i < Num; i++) { // go through all occupied wave functions
767	for (j=0;j<NDIM;j++) // put into Wannier centres
768	WannierCentre[i][j] = q[j];
769	}
770	break;
771	case 3:
772	debug(P,"Shifting Wannier centers individually to nearest grid point");
773	for (i=0;i < Num; i++) { // go through all wave functions
774	RMat33Vec3(q, Lat->ReciBasis, WannierCentre[i]);
775	for (j=0;j<NDIM;j++) { // Recibasis is not true inverse but times 2.*PI
776	q[j] = (double)N[j]/(2.PI);
777
778	//fprintf(stderr,"(%i) N[%i]: %i\t tmp %e\t floor %e\t ceil %e\n",P->Par.me, j, N[j], tmp, floor(tmp), ceil(tmp));
779	if (fabs((double)floor(q[j]) - q[j]) < fabs((double)ceil(q[j]) - q[j]))
780	q[j] = floor(q[j])/(double)N[j];
781	else
782	q[j] = ceil(q[j])/(double)N[j];
783	}
784	RMat33Vec3(WannierCentre[i], Lat->RealBasis, q);
785	}
786	break;
787	case 2:
788	debug(P,"Combining individual orbitals according to spread.");
789	//fprintf(stderr,"(%i) Finding multiple bindings and Reweighting Wannier centres\n",P->Par.me);
790	debug(P,"finding partners");
791	marker = (int) Malloc(sizeof(int)(Num+1),"ComputeMLWF: marker");
792	group = (int*) Malloc(sizeof(int )*Num,"ComputeMLWF: group");
793	for (l=0;l<Num;l++) {
794	group[l] = (int) Malloc(sizeof(int)(Num+1),"ComputeMLWF: group[l]"); // each must group must have one more as end marker
795	for (k=0;k<=Num;k++)
796	group[l][k] = -1; // reset partner group
797	}
798	for (k=0;k<Num;k++)
799	partner[k] = 0;
800	debug(P,"mem allocated");
801	// go for each orbital through every other, check distance against the sum of both spreads
802	// if smaller add to group of this orbital
803	for (l=0;l<Num;l++) {
804	j=0; // index for partner group
805	for (k=0;k<Num;k++) { // check this against l
806	Spread = 0.;
807	for (i=0;i<NDIM;i++) {
808	//fprintf(stderr,"(%i) Spread += (%e - %e)^2 \n", P->Par.me, WannierCentre[l][i], WannierCentre[k][i]);
809	Spread += (WannierCentre[l][i] - WannierCentre[k][i])*(WannierCentre[l][i] - WannierCentre[k][i]);
810	}
811	//Spread = sqrt(Spread); // distance in Spread
812	//fprintf(stderr,"(%i) %i to %i: distance %e, SpreadSum = %e + %e = %e \n", P->Par.me, l, k, Spread, WannierSpread[l], WannierSpread[k], WannierSpread[l]+WannierSpread[k]);
813	if (Spread < 5(WannierSpread[l]+WannierSpread[k])(WannierSpread[l]+WannierSpread[k])) {// if distance smaller than sum of spread times two plus safety
814	group[l][j++] = k; // add k to group of l
815	partner[l]++;
816	//fprintf(stderr,"(%i) %i added as %i-th member to %i's group.\n", P->Par.me, k, j, l);
817	}
818	}
819	}
820
821	// consistency, for each orbital check if this orbital is also in the group of each referred orbital
822	debug(P,"checking consistency");
823	totalflag = 1;
824	for (l=0;l<Num;l++) // checking l's group
825	for (k=0;k<Num;k++) { // k is partner index
826	if (group[l][k] != -1) { // if current index k is a partner
827	flag = 0;
828	for(j=0;j<Num;j++) { // go through each entry in l partner's partner group if l exists
829	if ((group[ group[l][k] ][j] == l))
830	flag = 1;
831	}
832	//if (flag == 0) fprintf(stderr, "(%i) in %i's group %i is referred as a partner, but not the other way round!\n", P->Par.me, l, group[l][k]);
833	if (totalflag == 1) totalflag = flag;
834	}
835	}
836	// for each orbital group (marker group) weight each center to a total and put this into the local WannierCentres
837	debug(P,"weight and calculate new centers for partner groups");
838	for (l=0;l<=Num;l++)
839	marker[l] = 1;
840	if (totalflag) {
841	for (l=0;l<Num;l++) { // go through each orbital
842	if (marker[l] != 0) { // if it hasn't been reweighted
843	marker[l] = 0;
844	for (i=0;i<NDIM;i++)
845	q[i] = 0.;
846	j = 0;
847	while (group[l][j] != -1) {
848	marker[group[l][j]] = 0;
849	for (i=0;i<NDIM;i++) {
850	//fprintf(stderr,"(%i) Adding to %i's group, %i entry of %i: %e\n", P->Par.me, l, i, group[l][j], WannierCentre[ group[l][j] ][i]);
851	q[i] += WannierCentre[ group[l][j] ][i];
852	}
853	j++;
854	}
855	//fprintf(stderr,"(%i) %i's group: (%e,%e,%e)/%i = (%e,%e,%e)\n", P->Par.me, l, q[0], q[1], q[2], j, q[0]/(double)j, q[1]/(double)j, q[2]/(double)j);
856	for (i=0;i<NDIM;i++) {// weight by number of elements in partner group
857	q[i] /= (double)(j);
858	}
859
860	// put WannierCentre into own and all partners'
861	for (i=0;i<NDIM;i++)
862	WannierCentre[l][i] = q[i];
863	j = 0;
864	while (group[l][j] != -1) {
865	for (i=0;i<NDIM;i++)
866	WannierCentre[group[l][j]][i] = q[i];
867	j++;
868	}
869	}
870	}
871	}
872	if (P->Call.out[StepLeaderOut]) {
873	fprintf(stderr,"Summary:\n");
874	fprintf(stderr,"========\n");
875	for (i=0;i<Num;i++)
876	fprintf(stderr,"%i belongs to a %i-ple binding.\n",i,partner[i]);
877	}
878	debug(P,"done");
879
880	Free(marker, "ChangeWannierCentres: marker");
881	for (l=0;l<Num;l++)
882	Free(group[l], "ChangeWannierCentres: group[l]");
883	Free(group, "ChangeWannierCentres: group");
884	break;
885	case 1:
886	debug(P,"Individual orbitals are changed to center of all.");
887	for (i=0;i<NDIM;i++) // zero center of weight
888	q[i] = 0.;
889	for (k=0;k<Num;k++)
890	for (i=0;i<NDIM;i++) { // sum up all orbitals each component
891	q[i] += WannierCentre[k][i];
892	}
893	for (i=0;i<NDIM;i++) // divide by number
894	q[i] /= Num;
895	for (k=0;k<Num;k++)
896	for (i=0;i<NDIM;i++) { // put into this function's array
897	WannierCentre[k][i] = q[i];
898	}
899	break;
900	case 0:
901	default:
902	break;
903	}
904
905	if ((P->Call.out[NormalOut]) && (1)) { // root prints gathered wannier centres for debugging
906	fprintf(stderr, "(%i) Printing changed Wannier centres:\n", P->Par.me);
907	for(l=0;l<Num;l++) {
908	fprintf(stderr, "(%i) ", P->Par.me);
909	for(j=0;j<NDIM;j++)
910	fprintf(stderr, "%lg\t", WannierCentre[l][j]);
911	fprintf(stderr, "-\t%lg\n", WannierSpread[l]);
912	}
913	}
914
915	// store info into local arrays again
916	i=-1;
917	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
918	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
919	if (OnePsiA->PsiType == type) { // drop all but occupied ones
920	i++; // increase l if it is occupied wave function
921	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {// is this local
922	for(j=0;j<NDIM;j++)
923	Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[i][j];
924	}
925	}
926	}
927
928	// show locally stored info for debugging
929	if (P->Call.out[NormalOut]) {
930	i=-1;
931	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
932	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
933	if (OnePsiA->PsiType == type) { // drop all but occupied ones
934	i++; // increase l if it is occupied wave function
935	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // is this local?
936	fprintf(stderr,"(%i) Psi %i, L %i: (x,y,z) = (%lg, %lg, %lg), Spread %lg\n",P->Par.me,i, R->LevSNo, Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[0], Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[1], Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[2], Psi->AddData[OnePsiA->MyLocalNo].WannierSpread);
937	}
938	}
939	}
940
941	// free memory
942	Free(WannierSpread, "ChangeWannierCentres: *WannierSpread");
943	for (j=0;j<Num;j++)
944	Free(WannierCentre[j], "ChangeWannierCentres: *WannierCentre[]");
945	Free(WannierCentre, "ChangeWannierCentres: **WannierCentre");
946	}
947
948	/** From the entries of the variance matrices the spread is calculated.
949	* WannierCentres are evaluated according to Resta operator: \f$\langle X \rangle = \frac{L}{2\pi} \sum_j {\cal I} \ln{\langle \Psi_j \| \exp{(i \frac{2\pi}{L} X)} \| \Psi_j \rangle}\f$
950	* WannierSpread is: \f$ \sum_j \langle \Psi_j \| r^2 \| \Psi_j \rangle - \langle \Psi_j \| r \| psi_j \rangle^2\f$
951	* \param *P Problem at hand
952	* \param *A variance matrices
953	* \param old_spread first term of wannier spread
954	* \param spread second term of wannier spread
955	* \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
956	* \param *WannierSpread array with wannier spread per wave function
957	*/
958	void ComputeWannierCentresfromVarianceMatrices(struct Problem P, gsl_matrix A, double spread, double old_spread, double WannierCentre, double WannierSpread)
959	{
960	struct Lattice *Lat = &P->Lat;
961	struct Psis *Psi = &Lat->Psi;
962	struct OnePsiElement *OnePsiA;
963	int i,j,k,l;
964	double tmp, q[NDIM];
965
966	*old_spread = 0;
967	*spread = 0;
968
969	// the spread for x,y,z resides in the respective diagonal element of A_.. for each orbital
970	i=-1;
971	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
972	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
973	if (OnePsiA->PsiType == type) { // drop all but occupied ones
974	i++; // increase l if it is occupied wave function
975	//fprintf(stderr,"(%i) Wannier for %i\n", P->Par.me, i);
976
977	// calculate Wannier Centre
978	for (j=0;j<NDIM;j++) {
979	q[j] = 1./(2.PI) GSL_IMAG( gsl_complex_log( gsl_complex_rect(gsl_matrix_get(A[j2],i,i),gsl_matrix_get(A[j2+1],i,i))));
980	if (q[j] < 0) // change wrap around of above operator to smooth 0...Lat->RealBasisSQ
981	q[j] += 1.;
982	}
983	RMat33Vec3(WannierCentre[i], Lat->RealBasis, q);
984
985	// store orbital spread and centre in file
986	tmp = - pow(gsl_matrix_get(A[0],i,i),2) - pow(gsl_matrix_get(A[1],i,i),2)
987	- pow(gsl_matrix_get(A[2],i,i),2) - pow(gsl_matrix_get(A[3],i,i),2)
988	- pow(gsl_matrix_get(A[4],i,i),2) - pow(gsl_matrix_get(A[5],i,i),2);
989	WannierSpread[i] = gsl_matrix_get(A[max_operators],i,i) + tmp;
990	//fprintf(stderr,"(%i) WannierSpread[%i] = %e\n", P->Par.me, i, WannierSpread[i]);
991	//if (P->Par.me == 0) fprintf(F->SpreadFile,"Orbital %d:\t Wannier center (x,y,z)=(%lg,%lg,%lg)\t Spread sigma^2 = %lg - %lg = %lg\n",
992	//Psi->AllPsiStatus[i].MyGlobalNo, WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2], gsl_matrix_get(A[max_operators],i,i), -tmp, WannierSpread[i]);
993	//if (P->Par.me == 0) fprintf(F->SpreadFile,"%e\t%e\t%e\n",
994	//WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2]);
995
996	// gather all spreads
997	*old_spread += gsl_matrix_get(A[max_operators],i,i); // tr(U^H B U)
998	for (k=0;k<max_operators;k++)
999	*spread += pow(gsl_matrix_get(A[k],i,i),2);
1000	}
1001	}
1002	}
1003
1004	/** Prints gsl_matrix nicely to screen.
1005	* \param *P Problem at hand
1006	* \param *U gsl matrix
1007	* \param Num number of rows/columns
1008	* \param *msg name of matrix, prepended before entries
1009	*/
1010	void PrintGSLMatrix(struct Problem P, gsl_matrix U, int Num, const char *msg)
1011	{
1012	int k,l;
1013	fprintf(stderr,"(%i) %s = \n",P->Par.me, msg);
1014	for (k=0;k<Num;k++) {
1015	for (l=0;l<Num;l++)
1016	fprintf(stderr,"%e\t",gsl_matrix_get(U,l,k));
1017	fprintf(stderr,"\n");
1018	}
1019	}
1020
1021	/** Allocates memory for the DiagonalizationData structure
1022	* \param *P Problem at hand
1023	* \param DiagData pointer to structure
1024	* \param Num number of rows and columns in matrix
1025	* \param NumMatrices number of matrices to be simultaneously diagonalized
1026	* \param extra number of extra matrices to be passively also diagonalized
1027	*/
1028	void InitDiagonalization(struct Problem P, struct DiagonalizationData DiagData, int Num, int NumMatrices, int extra)
1029	{
1030	int i;
1031
1032	// store integer values
1033	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 1\n",P->Par.me);
1034	DiagData->Num = Num;
1035	DiagData->AllocNum = ceil((double)Num / 2. ) *2;
1036	DiagData->NumMatrices = NumMatrices;
1037	DiagData->extra = extra;
1038
1039	// determine communicator and our rank and its size
1040	DiagData->comm = DetermineParallelGroupbyGCD(P,DiagData->AllocNum);
1041	MPI_Comm_size (*(DiagData->comm), &(DiagData->ProcNum));
1042	MPI_Comm_rank (*(DiagData->comm), &(DiagData->ProcRank));
1043
1044	// allocate memory
1045	DiagData->U = gsl_matrix_calloc(DiagData->AllocNum,DiagData->AllocNum);
1046	gsl_matrix_set_identity(DiagData->U);
1047
1048	DiagData->A = (gsl_matrix *) Malloc((DiagData->NumMatrices+DiagData->extra) sizeof(gsl_matrix ), "InitDiagonalization: A");
1049	for (i=0;i<(DiagData->NumMatrices+DiagData->extra);i++) {
1050	DiagData->A[i] = gsl_matrix_calloc(DiagData->AllocNum,DiagData->AllocNum);
1051	gsl_matrix_set_zero(DiagData->A[i]);
1052	}
1053
1054	// init merry-go-round array for diagonilzation
1055	DiagData->top = (int ) Malloc(sizeof(int)DiagData->AllocNum/2, "InitDiagonalization: top");
1056	DiagData->bot = (int ) Malloc(sizeof(int)DiagData->AllocNum/2, "InitDiagonalization: bot");
1057	for (i=0;i<DiagData->AllocNum/2;i++) {
1058	DiagData->top[i] = 2*i;
1059	DiagData->bot[i] = 2*i+1;
1060	}
1061	/* // print starting values of index generation tables top and bot
1062	fprintf(stderr,"top\t");
1063	for (k=0;k<AllocNum/2;k++)
1064	fprintf(stderr,"%i\t", top[k]);
1065	fprintf(stderr,"\n");
1066	fprintf(stderr,"bot\t");
1067	for (k=0;k<AllocNum/2;k++)
1068	fprintf(stderr,"%i\t", bot[k]);
1069	fprintf(stderr,"\n");*/
1070	}
1071
1072	/** Frees allocated memory for the DiagonalizationData structure.
1073	* \param DiagData pointer to structure
1074	*/
1075	void FreeDiagonalization(struct DiagonalizationData *DiagData)
1076	{
1077	int i;
1078
1079	Free(DiagData->top, "FreeDiagonalization: DiagData->top");
1080	Free(DiagData->bot, "FreeDiagonalization: DiagData->bot");
1081	gsl_matrix_free(DiagData->U);
1082	for (i=0;i<(DiagData->NumMatrices+DiagData->extra);i++)
1083	gsl_matrix_free(DiagData->A[i]);
1084	Free(DiagData->A, "FreeDiagonalization: DiagData->A");
1085	}
1086
1087	/** Orthogonalizing PsiType#Occupied wave functions for MD.
1088	* Orthogonalizes wave functions by finding a unitary transformation such that the density remains unchanged.
1089	* To this end, the overlap matrix \f$\langle \Psi_i \| \Psi_j \rangle\f$ is created and diagonalised by Jacobi
1090	* transformation (product of rotation matrices). The found transformation matrix is applied to all Psis.
1091	* \param *P Problem at hand
1092	* \note [Payne] states that GramSchmidt is not well-suited. However, this Jacobi diagonalisation is not well
1093	* suited for our CG minimisation either. If one wave functions is changed in course of the minimsation
1094	* procedure, in a Gram-Schmidt-Orthogonalisation only this wave function is changed (although it remains under
1095	* debate whether subsequent Psis are still orthogonal to this changed wave function!), in a Jacobi-Orthogonali-
1096	* stion however at least two if not all wave functions are changed. Thus inhibits our fast way of updating the
1097	* density after a minimisation step -- UpdateDensityCalculation(). Thus, this routine can only be used for a
1098	* final orthogonalisation of all Psis, after the CG minimisation.
1099	*/
1100	void OrthogonalizePsis(struct Problem *P)
1101	{
1102	struct Lattice *Lat = &P->Lat;
1103	struct Psis *Psi = &Lat->Psi;
1104	struct LatticeLevel *LevS = P->R.LevS;
1105	int Num = Psi->NoOfPsis;
1106	int i,j,l;
1107	struct DiagonalizationData DiagData;
1108	double PsiSP;
1109
1110	InitDiagonalization(P, &DiagData, Num, 1, 0);
1111
1112	// Calculate Overlap matrix
1113	for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++)
1114	CalculateOverlap(P, l, Occupied);
1115	for (i=0;i<Num;i++) // fill gsl matrix with overlap values
1116	for (j=0;j<Num;j++)
1117	gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
1118	//if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.A[0],Num,"<Psi_i\|Psi_J> (before)");
1119
1120	//if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.U,Num,"Transformation (before)");
1121
1122	// diagonalize overlap matrix
1123	Diagonalize(P, &DiagData);
1124
1125	//if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.U,Num,"Transformation (after)");
1126
1127	// apply found unitary transformation to wave functions
1128	UnitaryTransformationOnWavefunctions(P, DiagData.U, DiagData.AllocNum);
1129
1130	// update GramSch stati
1131	for (i=Psi->TypeStartIndex[Occupied];i<Psi->TypeStartIndex[UnOccupied];i++) {
1132	GramSchNormalize(P,LevS,LevS->LPsi->LocalPsi[i], 0.); // calculates square product if 0. is given...
1133	PsiSP = GramSchGetNorm2(P,LevS,LevS->LPsi->LocalPsi[i]);
1134	//if (P->Par.me_comm_ST_Psi == 0) fprintf(stderr,"(%i) PsiSP[%i] = %lg\n", P->Par.me, i, PsiSP);
1135	Psi->LocalPsiStatus[i].PsiGramSchStatus = (int)IsOrthonormal;
1136	}
1137	UpdateGramSchAllPsiStatus(P,Psi);
1138	/*
1139	for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++)
1140	CalculateOverlap(P, l, Occupied);
1141	for (i=0;i<Num;i++) // fill gsl matrix with overlap values
1142	for (j=0;j<Num;j++)
1143	gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
1144	if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.A[0],Num,"<Psi_i\|Psi_J> (after)");
1145	*/
1146	// free memory
1147	FreeDiagonalization(&DiagData);
1148	}
1149
1150	/** Orthogonalizing PsiType#Occupied wave functions and their time derivatives for MD.
1151	* Ensures that two orthonormality condition are met for Psis:
1152	* \f$\langle \Psi_i \| \Psi_j \rangle = \delta_{ij}\f$
1153	* \f$\langle \dot{\Psi_i} \| \dot{\Psi_j} \rangle = \delta_{ij}\f$
1154	* \param *P Problem at hand
1155	* \note [Payne] states that GramSchmidt is not well-suited, and this stronger
1156	* condition is needed for numerical stability
1157	* \todo create and fill 1stoverlap with finite difference between Psi, OldPsi with R->Deltat
1158	*/
1159	void StrongOrthogonalizePsis(struct Problem *P)
1160	{
1161	struct Lattice *Lat = &P->Lat;
1162	struct Psis *Psi = &Lat->Psi;
1163	int Num = Psi->NoOfPsis;
1164	int i,j,l;
1165	struct DiagonalizationData DiagData;
1166
1167	InitDiagonalization(P, &DiagData, Num, 2, 0);
1168
1169	// Calculate Overlap matrix
1170	for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++) {
1171	CalculateOverlap(P, l, Occupied);
1172	//Calculate1stOverlap(P, l, Occupied);
1173	}
1174	for (i=0;i<Num;i++) // fill gsl matrix with overlap values
1175	for (j=0;j<Num;j++) {
1176	gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
1177	//gsl_matrix_set(DiagData.A[1],i,j,Psi->1stOverlap[i][j]);
1178	}
1179
1180	// diagonalize overlap matrix
1181	Diagonalize(P, &DiagData);
1182
1183	// apply found unitary transformation to wave functions
1184	UnitaryTransformationOnWavefunctions(P, DiagData.U, DiagData.AllocNum);
1185
1186	// free memory
1187	FreeDiagonalization(&DiagData);
1188	}
1189
1190	/** Uses either serial or parallel diagonalization.
1191	* \param *P Problem at hand
1192	* \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
1193	* \sa ParallelDiagonalization(), SerialDiagonalization()
1194	*/
1195	void Diagonalize(struct Problem P, struct DiagonalizationData DiagData)
1196	{
1197	if (DiagData->Num != 1) { // one- or multi-process case?
1198	if (((DiagData->AllocNum % 2) == 0) && (DiagData->ProcNum != 1) && ((DiagData->AllocNum / 2) % DiagData->ProcNum == 0)) {
1199	/*
1200	debug(P,"Testing with silly matrix");
1201	ParallelDiagonalization(P, test, Utest, 1, 0, 4, Num, comm, ProcRank, ProcNum, top, bot);
1202	if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1203	PrintGSLMatrix(P, test[0], 4, "test[0] (diagonalized)");
1204	if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1205	PrintGSLMatrix(P, Utest, 4, "Utest (final)");
1206	*/
1207	debug(P,"ParallelDiagonalization");
1208	ParallelDiagonalization(P, DiagData);
1209	} else {/*
1210	debug(P,"Testing with silly matrix");
1211	SerialDiagonalization(P, test, Utest, 1, 0, 4, Num, top, bot);
1212	if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1213	PrintGSLMatrix(P, test[0], 4, "test[0] (diagonalized)");
1214	if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1215	PrintGSLMatrix(P, Utest, 4, "Utest (final)");
1216	*/
1217	debug(P,"SerialDiagonalization");
1218	SerialDiagonalization(P, DiagData);
1219	}
1220
1221	//if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1222	//PrintGSLMatrix(P, DiagData->U, DiagData->AllocNum, "U");
1223	}
1224	}
1225
1226	/** Computation of Maximally Localized Wannier Functions.
1227	* Maximally localized functions are prime when evulating a Hamiltonian with
1228	* magnetic fields under periodic boundary conditions, as the common position
1229	* operator is no longer valid. These can be obtained by orbital rotations, which
1230	* are looked for iteratively and gathered in one transformation matrix, to be
1231	* later applied to the set of orbital wave functions.
1232	*
1233	* In order to obtain these, the following algorithm is applied:
1234	* -# Initialize U (identity) as the sought-for transformation matrix
1235	* -# Compute the real symmetric (due to Gamma point symmetry!) matrix elements
1236	* \f$A^{(k)}_{ij} = \langle \phi_i \| A^{(k)} \| \phi_j \rangle\f$ for the six operators
1237	* \f$A^{(k)}\f$
1238	* -# For each pair of indices (i,j) (i<j) do the following:
1239	* -# Compute the 2x2 matrix \f$G = \Re \Bigl ( \sum_k h^H(A^{(k)}) h(A^{(k)}) \Bigr)\f$
1240	* where \f$h(A) = [a_{ii} - a_{jj}, a_{ij} + a_{ji}]\f$
1241	* -# Obtain eigenvalues and eigenvectors of G. Set \f$[x,y]^T\f$ to the eigenvector of G
1242	* corresponding to the greatest eigenvalue, such that \f$x\geq0\f$
1243	* -# Compute the rotation matrix R elements (ii,ij,ji,jj) \f$[c,s,-s,c]\f$ different from the
1244	* identity matrix by \f$r=\sqrt{x^2+y^2}\f$, \f$c = \sqrt{\frac{x+r}{2r}}\f$
1245	* \f$s=\frac{y}{\sqrt{2r(x+r)}}\f$
1246	* -# Perform the similarity operation \f$A^{(k)} \rightarrow R A^{(k)} R\f$
1247	* -# Gather the rotations in \f$U = U R\f$
1248	* -# Compute the total spread \f$\sigma^2_{A^{(k)}}\f$
1249	* -# Compare the change in spread to a desired minimum RunStruct#EpsWannier, if still greater go to step 3.
1250	* -# Apply transformations to the orbital wavefunctions \f$ \| \phi_i \rangle = \sum_j U_{ij} \| \phi_j \rangle\f$
1251	* -# Compute the position of the Wannier centers from diagonal elements of \f$A^{(k)}\f$, store in
1252	* OnePsiElementAddData#WannierCentre
1253	*
1254	* Afterwards, the routine applies the found unitary rotation to the unperturbed group of wave functions.
1255	* Note that hereby additional memory is needed as old and transformed wave functions must be present at the same
1256	* time.
1257	*
1258	* The routine uses parallelization if possible. A parallel Jacobi-Diagonalization is implemented using the index
1259	* generation in music() and shift-columns() such that the evaluated position operator eigenvalue matrices
1260	* may be diagonalized simultaneously and parallely. We use the implementation explained in
1261	* [Golub, Matrix computations, 1989, p451].
1262	*
1263	* \param *P Problem at hand
1264	*/
1265	void ComputeMLWF(struct Problem *P) {
1266	// variables and allocation
1267	struct Lattice *Lat = &P->Lat;
1268	struct Psis *Psi = &Lat->Psi;
1269	int i;
1270	int Num = Psi->NoOfPsis; // is number of occupied plus unoccupied states for rows
1271	double **WannierCentre;
1272	double *WannierSpread;
1273	double spread, spreadSQ;
1274	struct DiagonalizationData DiagData;
1275
1276	if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Beginning localization of orbitals ...\n",P->Par.me);
1277
1278	InitDiagonalization(P, &DiagData, Num, max_operators, 1);
1279
1280	// STEP 2: Calculate A[k]_ij = V/N \sum_{G1,G2} C^\ast_{l,G1} c_{m,G2} \sum_R A^{(k)}(R) exp(iR(G2-G1))
1281	//debug (P,"Calculatung Variance matrices");
1282	FillHigherOrderRealMomentsMatrices(P, DiagData.AllocNum, DiagData.A);
1283
1284	//debug (P,"Diagonalizing");
1285	Diagonalize(P, &DiagData);
1286
1287	// STEP 6: apply transformation U to all wave functions \sum_i^Num U_ji \| \phi_i \rangle = \| \phi_j^\ast \rangle
1288	//debug (P,"Performing Unitary transformation");
1289	UnitaryTransformationOnWavefunctions(P, DiagData.U, Num);
1290
1291	if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 7: Compute centres and spread printout\n",P->Par.me);
1292	WannierCentre = (double *) Malloc(sizeof(double )Num, "ComputeMLWF: WannierCentre");
1293	WannierSpread = (double ) Malloc(sizeof(double)Num, "ComputeMLWF: WannierSpread");
1294	for (i=0;i<Num;i++)
1295	WannierCentre[i] = (double ) Malloc(sizeof(double)NDIM, "ComputeMLWF: WannierCentre");
1296
1297	//debug (P,"Computing Wannier centres from variance matrix");
1298	ComputeWannierCentresfromVarianceMatrices(P, DiagData.A, &spread, &spreadSQ, WannierCentre, WannierSpread);
1299
1300	// write to file
1301	//debug (P,"Writing spread file");
1302	WriteWannierFile(P, spread, spreadSQ, WannierCentre, WannierSpread);
1303
1304	debug(P,"Free'ing memory");
1305	// free all remaining memory
1306	FreeDiagonalization(&DiagData);
1307	for (i=0;i<Num;i++)
1308	Free(WannierCentre[i], "ComputeMLWF: WannierCentre[i]");
1309	Free(WannierCentre, "ComputeMLWF: WannierCentre");
1310	Free(WannierSpread, "ComputeMLWF: WannierSpread");
1311	}
1312
1313	/** Solves directly for eigenvectors and eigenvalues of a real 2x2 matrix.
1314	* The eigenvalues are determined by \f$\det{(A-\lambda \cdot I)}\f$, where \f$I\f$ is the unit matrix and
1315	* \f$\lambda\f$ an eigenvalue. This leads to a pq-formula which is easily evaluated in the first part.
1316	*
1317	* The eigenvectors are then obtained by solving \f$A-\lambda \cdot I)x = 0\f$ for a given eigenvalue
1318	* \f$\lambda\f$. However, the eigenvector magnitudes are not specified, thus we the equation system is
1319	* still lacking such an equation. We use this fact to set either coordinate arbitrarily to 1 and then
1320	* derive the other by solving the equation system. Finally, the eigenvector is normalized.
1321	* \param *P Problem at hand
1322	* \param *A matrix whose eigenvalues/-vectors are to be found
1323	* \param *eval vector with eigenvalues
1324	* \param *evec matrix with corresponding eigenvectors in columns
1325	*/
1326	#ifdef HAVE_INLINE
1327	inline void EigensolverFor22Matrix(struct Problem P, gsl_matrix A, gsl_vector eval, gsl_matrix evec)
1328	#else
1329	void EigensolverFor22Matrix(struct Problem P, gsl_matrix A, gsl_vector eval, gsl_matrix evec)
1330	#endif
1331	{
1332	double a11,a12,a21,a22;
1333	double ev[2], summand1, summand2, norm;
1334	int i;
1335	// find eigenvalues
1336	a11 = gsl_matrix_get(A,0,0);
1337	a12 = gsl_matrix_get(A,0,1);
1338	a21 = gsl_matrix_get(A,1,0);
1339	a22 = gsl_matrix_get(A,1,1);
1340	summand1 = (a11+a22)/2.;
1341	summand2 = sqrt(summand1summand1 + a12a21 - a11*a22);
1342	ev[0] = summand1 + summand2;
1343	ev[1] = summand1 - summand2;
1344	gsl_vector_set(eval, 0, ev[0]);
1345	gsl_vector_set(eval, 1, ev[1]);
1346	//fprintf (stderr,"(%i) ev1 %lg \t ev2 %lg\n", P->Par.me, ev1, ev2);
1347
1348	// find eigenvectors
1349	for(i=0;i<2;i++) {
1350	if (fabs(ev[i]) < MYEPSILON) {
1351	gsl_matrix_set(evec, 0,i, 0.);
1352	gsl_matrix_set(evec, 1,i, 0.);
1353	} else if (fabs(a22-ev[i]) > MYEPSILON) {
1354	norm = sqrt(11 + (a21a21/(a22-ev[i])/(a22-ev[i])));
1355	gsl_matrix_set(evec, 0,i, 1./norm);
1356	gsl_matrix_set(evec, 1,i, -(a21/(a22-ev[i]))/norm);
1357	//fprintf (stderr,"(%i) evec %i (%lg,%lg)", P->Par.me, i, -(a12/(a11-ev[i]))/norm, 1./norm);
1358	} else if (fabs(a12) > MYEPSILON) {
1359	norm = sqrt(11 + (a11-ev[i])(a11-ev[i])/(a12*a12));
1360	gsl_matrix_set(evec, 0,i, 1./norm);
1361	gsl_matrix_set(evec, 1,i, -((a11-ev[i])/a12)/norm);
1362	//fprintf (stderr,"(%i) evec %i (%lg,%lg)", P->Par.me, i, -(a12/(a11-ev[i]))/norm, 1./norm);
1363	} else {
1364	if (fabs(a11-ev[i]) > MYEPSILON) {
1365	norm = sqrt(11 + (a12a12/(a11-ev[i])/(a11-ev[i])));
1366	gsl_matrix_set(evec, 0,i, -(a12/(a11-ev[i]))/norm);
1367	gsl_matrix_set(evec, 1,i, 1./norm);
1368	//fprintf (stderr,"\t evec %i (%lg,%lg)\n", i, -(a12/(a11-ev[i]))/norm, 1./norm);
1369	} else if (fabs(a21) > MYEPSILON) {
1370	norm = sqrt(11 + (a22-ev[i])(a22-ev[i])/(a21*a21));
1371	gsl_matrix_set(evec, 0,i, -(a22-ev[i])/a21/norm);
1372	gsl_matrix_set(evec, 1,i, 1./norm);
1373	//fprintf (stderr,"\t evec %i (%lg,%lg)\n", i, -(a12/(a11-ev[i]))/norm, 1./norm);
1374	} else {
1375	//gsl_matrix_set(evec, 0,i, 0.);
1376	//gsl_matrix_set(evec, 1,i, 1.);
1377	fprintf (stderr,"\t evec %i undetermined\n", i);
1378	}
1379	//gsl_matrix_set(evec, 0,0, 1.);
1380	//gsl_matrix_set(evec, 1,0, 0.);
1381	//fprintf (stderr,"(%i) evec1 undetermined", P->Par.me);
1382	}
1383	}
1384	}
1385
1386	/** Calculates sine and cosine values for multiple matrix diagonalization.
1387	* \param *evec eigenvectors in columns
1388	* \param *eval corresponding eigenvalues
1389	* \param *c cosine to be returned
1390	* \param *s sine to be returned
1391	*/
1392	#ifdef HAVE_INLINE
1393	inline void CalculateRotationAnglesFromEigenvalues(gsl_matrix evec, gsl_vector eval, double c, double s)
1394	#else
1395	void CalculateRotationAnglesFromEigenvalues(gsl_matrix evec, gsl_vector eval, double c, double s)
1396	#endif
1397	{
1398	int index;
1399	double x,y,r;
1400
1401	index = gsl_vector_max_index (eval); // get biggest eigenvalue
1402	//fprintf(stderr,"\t1st: %lg\t2nd: %lg --- biggest: %i\n", gsl_vector_get(eval, 0), gsl_vector_get(eval, 1), index);
1403	x = gsl_matrix_get(evec, 0, index);
1404	y = gsl_matrix_get(evec, 1, index);
1405	if (x < 0) { // ensure x>=0 so that rotation angles remain smaller Pi/4
1406	y = -y;
1407	x = -x;
1408	}
1409	//z = gsl_matrix_get(evec, 2, index) * x/fabs(x);
1410	//fprintf(stderr,"\tx %lg\ty %lg\n", x,y);
1411
1412	//fprintf(stderr,"(%i),(%i,%i) STEP 3c\n",P->Par.me,i,j);
1413	// STEP 3c: calculate R = [[c,s^\ast],[-s,c^\ast]]
1414	r = sqrt(xx + yy); // + z*z);
1415	if (fabs(r) > MYEPSILON) {
1416	c = sqrt((x + r) / (2.r));
1417	s = y / sqrt(2.r(x+r)); //, -z / sqrt(2r*(x+r)));
1418	} else {
1419	*c = 1.;
1420	*s = 0.;
1421	}
1422	}
1423
1424	/*
1425	* \param **A matrices (pointer array with \a NumMatrices entries) to be diagonalized
1426	* \param *U transformation matrix set to unity matrix
1427	* \param NumMatrices number of matrices to be diagonalized simultaneously
1428	* \param extra number of additional matrices the rotation is applied to however which actively diagonalized (follow in \a **A)
1429	* \param AllocNum number of rows/columns in matrices
1430	* \param Num number of wave functions
1431	* \param ProcRank index in group for this cpu
1432	* \param ProcNum number of cpus in group
1433	* \param *top array with top row indices (merry-go-round)
1434	* \param *bot array with bottom row indices (merry-go-round)
1435	*/
1436
1437	/** Simultaneous diagonalization of matrices with multiple cpus.
1438	* \param *P Problem at hand
1439	* \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
1440	* \note this is slower given one cpu only than SerialDiagonalization()
1441	*/
1442	void ParallelDiagonalization(struct Problem P, struct DiagonalizationData DiagData)
1443	{
1444	struct RunStruct *R =&P->R;
1445	int tagR0, tagR1, tagS0, tagS1;
1446	int iloc, jloc;
1447	double s_all, c_all;
1448	int round, max_rounds;
1449	int start;
1450	int rcounts, rdispls;
1451	double c, s;
1452	int Lsend, Rsend, Lrecv, Rrecv; // where left(right) column is sent to or where it originates from
1453	int left, right; // left or right neighbour for process
1454	double spread = 0., old_spread=0., Spread=0.;
1455	int i,j,k,l,m,u;
1456	int set;
1457	int it_steps; // iteration step counter
1458	double Aloc[DiagData->NumMatrices+1], Uloc; // local columns for one step of A[k]
1459	double Around[DiagData->NumMatrices+1], Uround; // all local columns for one round of A[k]
1460	double Atotal[DiagData->NumMatrices+1], Utotal; // all local columns for one round of A[k]
1461	double a_i, a_j;
1462	gsl_matrix *G;
1463	gsl_vector *h;
1464	gsl_vector *eval;
1465	gsl_matrix *evec;
1466	//gsl_eigen_symmv_workspace *w;
1467
1468	max_rounds = (DiagData->AllocNum / 2)/DiagData->ProcNum; // each process must perform multiple rotations per step of a set
1469	if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) start %i\tstep %i\tmax.rounds %i\n",P->Par.me, DiagData->ProcRank, DiagData->ProcNum, max_rounds);
1470
1471	// allocate column vectors for interchange of columns
1472	debug(P,"allocate column vectors for interchange of columns");
1473	c = (double ) Malloc(sizeof(double)max_rounds, "ComputeMLWF: c");
1474	s = (double ) Malloc(sizeof(double)max_rounds, "ComputeMLWF: s");
1475	c_all = (double ) Malloc(sizeof(double)DiagData->AllocNum/2, "ComputeMLWF: c_all");
1476	s_all = (double ) Malloc(sizeof(double)DiagData->AllocNum/2, "ComputeMLWF: s_all");
1477	rcounts = (int ) Malloc(sizeof(int)DiagData->ProcNum, "ComputeMLWF: rcounts");
1478	rdispls = (int ) Malloc(sizeof(int)DiagData->ProcNum, "ComputeMLWF: rdispls");
1479
1480	// allocate eigenvector stuff
1481	debug(P,"allocate eigenvector stuff");
1482	G = gsl_matrix_calloc (2,2);
1483	h = gsl_vector_alloc (2);
1484	eval = gsl_vector_alloc (2);
1485	evec = gsl_matrix_alloc (2,2);
1486	//w = gsl_eigen_symmv_alloc(2);
1487
1488	// establish communication partners
1489	debug(P,"establish communication partners");
1490	if (DiagData->ProcRank == 0) {
1491	tagS0 = WannierALTag; // left p0 always remains left p0
1492	} else {
1493	tagS0 = (DiagData->ProcRank == DiagData->ProcNum - 1) ? WannierARTag : WannierALTag; // left p_last becomes right p_last
1494	}
1495	tagS1 = (DiagData->ProcRank == 0) ? WannierALTag : WannierARTag; // right p0 always goes into left p1
1496	tagR0 = WannierALTag; //
1497	tagR1 = WannierARTag; // first process
1498	if (DiagData->ProcRank == 0) {
1499	left = DiagData->ProcNum-1;
1500	right = 1;
1501	Lsend = 0;
1502	Rsend = 1;
1503	Lrecv = 0;
1504	Rrecv = 1;
1505	} else if (DiagData->ProcRank == DiagData->ProcNum - 1) {
1506	left = DiagData->ProcRank - 1;
1507	right = 0;
1508	Lsend = DiagData->ProcRank;
1509	Rsend = DiagData->ProcRank - 1;
1510	Lrecv = DiagData->ProcRank - 1;
1511	Rrecv = DiagData->ProcRank;
1512	} else {
1513	left = DiagData->ProcRank - 1;
1514	right = DiagData->ProcRank + 1;
1515	Lsend = DiagData->ProcRank+1;
1516	Rsend = DiagData->ProcRank - 1;
1517	Lrecv = DiagData->ProcRank - 1;
1518	Rrecv = DiagData->ProcRank+1;
1519	}
1520	//if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) left %i\t right %i --- Lsend %i\tRsend%i\tLrecv %i\tRrecv%i\n",P->Par.me, left, right, Lsend, Rsend, Lrecv, Rrecv);
1521
1522	// initialise A_loc
1523	debug(P,"initialise A_loc");
1524	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {
1525	//Aloc[k] = (double ) Malloc(sizeof(double)AllocNum*2, "ComputeMLWF: Aloc[k]");
1526	Around[k] = (double ) Malloc(sizeof(double)DiagData->AllocNum2max_rounds, "ComputeMLWF: Around[k]");
1527	Atotal[k] = (double ) Malloc(sizeof(double)DiagData->AllocNum*DiagData->AllocNum, "ComputeMLWF: Atotal[k]");
1528	Aloc[k] = (double *) Malloc(sizeof(double )2max_rounds, "ComputeMLWF: Aloc[k]");
1529	//Around[k] = &Atotal[k][ProcRankAllocNum2*max_rounds];
1530
1531	for (round=0;round<max_rounds;round++) {
1532	Aloc[k][2round] = &Around[k][DiagData->AllocNum(2*round)];
1533	Aloc[k][2round+1] = &Around[k][DiagData->AllocNum(2*round+1)];
1534	for (l=0;l<DiagData->AllocNum;l++) {
1535	Aloc[k][2round][l] = gsl_matrix_get(DiagData->A[k],l,2(DiagData->ProcRank*max_rounds+round));
1536	Aloc[k][2round+1][l] = gsl_matrix_get(DiagData->A[k],l,2(DiagData->ProcRank*max_rounds+round)+1);
1537	//fprintf(stderr,"(%i) (%i, 0/1) A_loc1 %e\tA_loc2 %e\n",P->Par.me, l, Aloc[k][l],Aloc[k][l+AllocNum]);
1538	}
1539	}
1540	}
1541	// initialise U_loc
1542	debug(P,"initialise U_loc");
1543	//Uloc = (double ) Malloc(sizeof(double)AllocNum*2, "ComputeMLWF: Uloc");
1544	Uround = (double ) Malloc(sizeof(double)DiagData->AllocNum2max_rounds, "ComputeMLWF: Uround");
1545	Utotal = (double ) Malloc(sizeof(double)DiagData->AllocNum*DiagData->AllocNum, "ComputeMLWF: Utotal");
1546	Uloc = (double *) Malloc(sizeof(double )2max_rounds, "ComputeMLWF: Uloc");
1547	//Uround = &Utotal[ProcRankAllocNum2*max_rounds];
1548	for (round=0;round<max_rounds;round++) {
1549	Uloc[2round] = &Uround[DiagData->AllocNum(2*round)];
1550	Uloc[2round+1] = &Uround[DiagData->AllocNum(2*round+1)];
1551	for (l=0;l<DiagData->AllocNum;l++) {
1552	Uloc[2round][l] = gsl_matrix_get(DiagData->U,l,2(DiagData->ProcRank*max_rounds+round));
1553	Uloc[2round+1][l] = gsl_matrix_get(DiagData->U,l,2(DiagData->ProcRank*max_rounds+round)+1);
1554	//fprintf(stderr,"(%i) (%i, 0/1) U_loc1 %e\tU_loc2 %e\n",P->Par.me, l, Uloc[l+AllocNum0],Uloc[l+AllocNum1]);
1555	}
1556	}
1557
1558	// now comes the iteration loop
1559	debug(P,"now comes the iteration loop");
1560	it_steps = 0;
1561	do {
1562	it_steps++;
1563	//if (P->Par.me == 0) fprintf(stderr,"(%i) Beginning parallel iteration %i ... ",P->Par.me,it_steps);
1564	for (set=0; set < DiagData->AllocNum-1; set++) { // one column less due to column 0 staying at its place all the time
1565	//fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
1566	for (round = 0; round < max_rounds;round++) {
1567	start = DiagData->ProcRank * max_rounds + round;
1568	// get indices
1569	i = DiagData->top[start] < DiagData->bot[start] ? DiagData->top[start] : DiagData->bot[start]; // minimum of the two indices
1570	iloc = DiagData->top[start] < DiagData->bot[start] ? 0 : 1;
1571	j = DiagData->top[start] > DiagData->bot[start] ? DiagData->top[start] : DiagData->bot[start]; // maximum of the two indices: thus j > i
1572	jloc = DiagData->top[start] > DiagData->bot[start] ? 0 : 1;
1573	//fprintf(stderr,"(%i) my (%i,%i), loc(%i,%i)\n",P->Par.me, i,j, iloc, jloc);
1574
1575	// calculate rotation angle, i.e. c and s
1576	//fprintf(stderr,"(%i),(%i,%i) calculate rotation angle\n",P->Par.me,i,j);
1577	gsl_matrix_set_zero(G);
1578	for (k=0;k<DiagData->NumMatrices;k++) { // go through all operators ...
1579	// Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]
1580	//fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,Aloc[k][2round+iloc][i], Aloc[k][2round+jloc][j],Aloc[k][2round+iloc][i] - Aloc[k][2round+jloc][j]);
1581	//fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e + %e = %e\n",P->Par.me, k,Aloc[k][2round+jloc][i], Aloc[k][2round+iloc][j],Aloc[k][2round+jloc][i] + Aloc[k][2round+iloc][j]);
1582	gsl_vector_set(h, 0, Aloc[k][2round+iloc][i] - Aloc[k][2round+jloc][j]);
1583	gsl_vector_set(h, 1, Aloc[k][2round+jloc][i] + Aloc[k][2round+iloc][j]);
1584
1585	// Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
1586	for (l=0;l<2;l++)
1587	for (m=0;m<2;m++)
1588	gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
1589	}
1590	//fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
1591	// STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
1592	EigensolverFor22Matrix(P,G,eval,evec);
1593	//gsl_eigen_symmv(G, eval, evec, w); // calculates eigenvalues and eigenvectors of G
1594
1595	CalculateRotationAnglesFromEigenvalues(evec, eval, &c[round], &s[round]);
1596	//fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[round],s[round]);
1597
1598	//fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
1599	// V_loc = V_loc * V_small
1600	//debug(P,"apply rotation to local U");
1601	for (l=0;l<DiagData->AllocNum;l++) {
1602	a_i = Uloc[2*round+iloc][l];
1603	a_j = Uloc[2*round+jloc][l];
1604	Uloc[2round+iloc][l] = c[round] a_i + s[round] * a_j;
1605	Uloc[2round+jloc][l] = -s[round] a_i + c[round] * a_j;
1606	}
1607	} // end of round
1608	// circulate the rotation angles
1609	//debug(P,"circulate the rotation angles");
1610	MPI_Allgather(c, max_rounds, MPI_DOUBLE, c_all, max_rounds, MPI_DOUBLE, *(DiagData->comm)); // MPI_Allgather is waaaaay faster than ring circulation
1611	MPI_Allgather(s, max_rounds, MPI_DOUBLE, s_all, max_rounds, MPI_DOUBLE, *(DiagData->comm));
1612	//m = start;
1613	for (l=0;l<DiagData->AllocNum/2;l++) { // for each process
1614	// we have V_small from process k
1615	//debug(P,"Apply V_small from other process");
1616	i = DiagData->top[l] < DiagData->bot[l] ? DiagData->top[l] : DiagData->bot[l]; // minimum of the two indices
1617	j = DiagData->top[l] > DiagData->bot[l] ? DiagData->top[l] : DiagData->bot[l]; // maximum of the two indices: thus j > i
1618	iloc = DiagData->top[l] < DiagData->bot[l] ? 0 : 1;
1619	jloc = DiagData->top[l] > DiagData->bot[l] ? 0 : 1;
1620	for (m=0;m<max_rounds;m++) {
1621	//fprintf(stderr,"(%i) %i processes' (%i,%i)\n",P->Par.me, m,i,j);
1622	// apply row rotation to each A[k]
1623	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
1624	//fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2m+iloc][i],j,Aloc[k][2m+iloc][j]);
1625	//fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2m+jloc][i],j,Aloc[k][2m+jloc][j]);
1626	a_i = Aloc[k][2*m+iloc][i];
1627	a_j = Aloc[k][2*m+iloc][j];
1628	Aloc[k][2m+iloc][i] = c_all[l] a_i + s_all[l] * a_j;
1629	Aloc[k][2m+iloc][j] = -s_all[l] a_i + c_all[l] * a_j;
1630	a_i = Aloc[k][2*m+jloc][i];
1631	a_j = Aloc[k][2*m+jloc][j];
1632	Aloc[k][2m+jloc][i] = c_all[l] a_i + s_all[l] * a_j;
1633	Aloc[k][2m+jloc][j] = -s_all[l] a_i + c_all[l] * a_j;
1634	//fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2m+iloc][i],j,Aloc[k][2m+iloc][j]);
1635	//fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2m+jloc][i],j,Aloc[k][2m+jloc][j]);
1636	}
1637	}
1638	}
1639	// apply rotation to local operator matrices
1640	// A_loc = A_loc * V_small
1641	//debug(P,"apply rotation to local operator matrices A[k]");
1642	for (m=0;m<max_rounds;m++) {
1643	start = DiagData->ProcRank * max_rounds + m;
1644	iloc = DiagData->top[start] < DiagData->bot[start] ? 0 : 1;
1645	jloc = DiagData->top[start] > DiagData->bot[start] ? 0 : 1;
1646	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// extra for B matrix !
1647	for (l=0;l<DiagData->AllocNum;l++) {
1648	// Columns, i and j belong to this process only!
1649	a_i = Aloc[k][2*m+iloc][l];
1650	a_j = Aloc[k][2*m+jloc][l];
1651	Aloc[k][2m+iloc][l] = c[m] a_i + s[m] * a_j;
1652	Aloc[k][2m+jloc][l] = -s[m] a_i + c[m] * a_j;
1653	//fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, l, Aloc[k][2m+iloc][l],l,Aloc[k][2m+jloc][l]);
1654	}
1655	}
1656	}
1657	// Shuffling of these round's columns to prepare next rotation set
1658	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
1659	// extract columns from A
1660	//debug(P,"extract columns from A");
1661	MerryGoRoundColumns(*(DiagData->comm), Aloc[k], DiagData->AllocNum, max_rounds, k, tagS0, tagS1, tagR0, tagR1);
1662
1663	}
1664	// and also for V ...
1665	//debug(P,"extract columns from U");
1666	MerryGoRoundColumns(*(DiagData->comm), Uloc, DiagData->AllocNum, max_rounds, 0, tagS0, tagS1, tagR0, tagR1);
1667
1668
1669	// and merry-go-round for the indices too
1670	//debug(P,"and merry-go-round for the indices too");
1671	MerryGoRoundIndices(DiagData->top, DiagData->bot, DiagData->AllocNum/2);
1672	}
1673
1674	//fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
1675	// STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
1676	old_spread = Spread;
1677	spread = 0.;
1678	for(k=0;k<DiagData->NumMatrices;k++) { // go through all self-adjoint operators
1679	for (i=0; i < 2*max_rounds; i++) { // go through all wave functions
1680	spread += Aloc[k][i][i+DiagData->ProcRank2max_rounds]Aloc[k][i][i+DiagData->ProcRank2*max_rounds];
1681	//spread += gsl_matrix_get(A[k],i,i)*gsl_matrix_get(A[k],i,i);
1682	}
1683	}
1684	MPI_Allreduce(&spread, &Spread, 1, MPI_DOUBLE, MPI_SUM, *(DiagData->comm));
1685	//Spread = spread;
1686	if (P->Par.me == 0) {
1687	//if(P->Call.out[ReadOut])
1688	// fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,Spread,R->EpsWannier);
1689	//else
1690	//fprintf(stderr,"%2.9e\n",Spread);
1691	}
1692	// STEP 5: check change of variance
1693	} while (fabs(old_spread-Spread) >= R->EpsWannier);
1694	// end of iterative diagonalization loop: We have found our final orthogonal U!
1695
1696	// gather local parts of U into complete matrix
1697	for (l=0;l<DiagData->ProcNum;l++)
1698	rcounts[l] = DiagData->AllocNum;
1699	debug(P,"allgather U");
1700	for (round=0;round<2*max_rounds;round++) {
1701	for (l=0;l<DiagData->ProcNum;l++)
1702	rdispls[l] = (l2max_rounds + round)*DiagData->AllocNum;
1703	MPI_Allgatherv(Uloc[round], DiagData->AllocNum, MPI_DOUBLE, Utotal, rcounts, rdispls, MPI_DOUBLE, *(DiagData->comm));
1704	}
1705	for (k=0;k<DiagData->AllocNum;k++) {
1706	for(l=0;l<DiagData->AllocNum;l++) {
1707	gsl_matrix_set(DiagData->U,k,l, Utotal[l+k*DiagData->AllocNum]);
1708	}
1709	}
1710
1711	// after one set, gather A[k] from all and calculate spread
1712	for (l=0;l<DiagData->ProcNum;l++)
1713	rcounts[l] = DiagData->AllocNum;
1714	debug(P,"gather A[k] for spread");
1715	for (u=0;u<DiagData->NumMatrices+DiagData->extra;u++) {// extra for B matrix !
1716	debug(P,"A[k] all gather");
1717	for (round=0;round<2*max_rounds;round++) {
1718	for (l=0;l<DiagData->ProcNum;l++)
1719	rdispls[l] = (l2max_rounds + round)*DiagData->AllocNum;
1720	MPI_Allgatherv(Aloc[u][round], DiagData->AllocNum, MPI_DOUBLE, Atotal[u], rcounts, rdispls, MPI_DOUBLE, *(DiagData->comm));
1721	}
1722	for (k=0;k<DiagData->AllocNum;k++) {
1723	for(l=0;l<DiagData->AllocNum;l++) {
1724	gsl_matrix_set(DiagData->A[u],k,l, Atotal[u][l+k*DiagData->AllocNum]);
1725	}
1726	}
1727	}
1728
1729	// free eigenvector stuff
1730	gsl_vector_free(h);
1731	gsl_matrix_free(G);
1732	//gsl_eigen_symmv_free(w);
1733	gsl_vector_free(eval);
1734	gsl_matrix_free(evec);
1735
1736	// Free column vectors
1737	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {
1738	Free(Atotal[k], "ParallelDiagonalization: Atotal[k]");
1739	Free(Around[k], "ParallelDiagonalization: Around[k]");
1740	}
1741	Free(Uround, "ParallelDiagonalization: Uround");
1742	Free(Utotal, "ParallelDiagonalization: Utotal");
1743	Free(c_all, "ParallelDiagonalization: c_all");
1744	Free(s_all, "ParallelDiagonalization: s_all");
1745	Free(c, "ParallelDiagonalization: c");
1746	Free(s, "ParallelDiagonalization: s");
1747	Free(rcounts, "ParallelDiagonalization: rcounts");
1748	Free(rdispls, "ParallelDiagonalization: rdispls");
1749	}
1750	/*
1751	* \param **A matrices (pointer array with \a NumMatrices entries) to be diagonalized
1752	* \param *U transformation matrix set to unity matrix
1753	* \param NumMatrices number of matrices to be diagonalized
1754	* \param extra number of additional matrices the rotation is applied to however which actively diagonalized (follow in \a **A)
1755	* \param AllocNum number of rows/columns in matrices
1756	* \param Num number of wave functions
1757	* \param *top array with top row indices (merry-go-round)
1758	* \param *bot array with bottom row indices (merry-go-round)
1759	*/
1760
1761	/** Simultaneous Diagonalization of variances matrices with one cpu.
1762	* \param *P Problem at hand
1763	* \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
1764	* \note this is faster given one cpu only than ParallelDiagonalization()
1765	*/
1766	void SerialDiagonalization(struct Problem P, struct DiagonalizationData DiagData)
1767	{
1768	struct RunStruct *R = &P->R;
1769	gsl_matrix *G;
1770	gsl_vector *h;
1771	gsl_vector *eval;
1772	gsl_matrix *evec;
1773	//gsl_eigen_symmv_workspace *w;
1774	double c,s,a_i,a_j;
1775	int it_steps, set, ProcRank;
1776	int i,j,k,l,m;
1777	double spread = 0., old_spread = 0.;\
1778
1779	// allocate eigenvector stuff
1780	debug(P,"allocate eigenvector stuff");
1781	G = gsl_matrix_calloc (2,2);
1782	h = gsl_vector_alloc (2);
1783	eval = gsl_vector_alloc (2);
1784	evec = gsl_matrix_alloc (2,2);
1785	//w = gsl_eigen_symmv_alloc(2);
1786
1787	c = (double *) Malloc(sizeof(double), "ComputeMLWF: c");
1788	s = (double *) Malloc(sizeof(double), "ComputeMLWF: s");
1789	debug(P,"now comes the iteration loop");
1790	it_steps=0;
1791	do {
1792	it_steps++;
1793	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 3: Iteratively maximize negative spread part\n",P->Par.me);
1794	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) Beginning iteration %i ... ",P->Par.me,it_steps);
1795	for (set=0; set < DiagData->AllocNum-1; set++) { // one column less due to column 0 stating at its place all the time
1796	//fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
1797	// STEP 3: for all index pairs 0<= i<j <AllocNum
1798	for (ProcRank=0;ProcRank<DiagData->AllocNum/2;ProcRank++) {
1799	// get indices
1800	i = DiagData->top[ProcRank] < DiagData->bot[ProcRank] ? DiagData->top[ProcRank] : DiagData->bot[ProcRank]; // minimum of the two indices
1801	j = DiagData->top[ProcRank] > DiagData->bot[ProcRank] ? DiagData->top[ProcRank] : DiagData->bot[ProcRank]; // maximum of the two indices: thus j > i
1802	//fprintf(stderr,"(%i),(%i,%i) STEP 3a\n",P->Par.me,i,j);
1803	// STEP 3a: Calculate G
1804	gsl_matrix_set_zero(G);
1805
1806	for (k=0;k<DiagData->NumMatrices;k++) { // go through all operators ...
1807	// Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]
1808	//fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,gsl_matrix_get(DiagData->A[k],i,i), gsl_matrix_get(DiagData->A[k],j,j),gsl_matrix_get(DiagData->A[k],i,i) - gsl_matrix_get(DiagData->A[k],j,j));
1809	//fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e + %e = %e\n",P->Par.me, k,gsl_matrix_get(DiagData->A[k],i,j), gsl_matrix_get(DiagData->A[k],j,i),gsl_matrix_get(DiagData->A[k],i,j) + gsl_matrix_get(DiagData->A[k],j,i));
1810	gsl_vector_set(h, 0, gsl_matrix_get(DiagData->A[k],i,i) - gsl_matrix_get(DiagData->A[k],j,j));
1811	gsl_vector_set(h, 1, gsl_matrix_get(DiagData->A[k],i,j) + gsl_matrix_get(DiagData->A[k],j,i));
1812	//gsl_vector_complex_set(h, 2, gsl_complex_mul_imag(gsl_complex_add(gsl_matrix_complex_get(A[k],j,i), gsl_matrix_complex_get(A[k],i,j)),1));
1813
1814	// Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
1815	for (l=0;l<2;l++)
1816	for (m=0;m<2;m++)
1817	gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
1818
1819	//PrintGSLMatrix(P,G,2, "G");
1820	}
1821
1822	//fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
1823	// STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
1824	EigensolverFor22Matrix(P,G,eval,evec);
1825	//gsl_eigen_symmv(G, eval, evec, w); // calculates eigenvalues and eigenvectors of G
1826
1827	//PrintGSLMatrix(P, evec, 2, "Eigenvectors");
1828	CalculateRotationAnglesFromEigenvalues(evec, eval, c, s);
1829	//fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[0],s[0]);
1830
1831	//fprintf(stderr,"(%i),(%i,%i) STEP 3d\n",P->Par.me,i,j);
1832	// STEP 3d: apply rotation R to rows and columns of A^{(k)}
1833	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
1834	//PrintGSLMatrix(P,A[k],AllocNum, "A (before rot)");
1835	for (l=0;l<DiagData->AllocNum;l++) {
1836	// Rows
1837	a_i = gsl_matrix_get(DiagData->A[k],i,l);
1838	a_j = gsl_matrix_get(DiagData->A[k],j,l);
1839	gsl_matrix_set(DiagData->A[k], i, l, c[0] * a_i + s[0] * a_j);
1840	gsl_matrix_set(DiagData->A[k], j, l, -s[0] * a_i + c[0] * a_j);
1841	}
1842	for (l=0;l<DiagData->AllocNum;l++) {
1843	// Columns
1844	a_i = gsl_matrix_get(DiagData->A[k],l,i);
1845	a_j = gsl_matrix_get(DiagData->A[k],l,j);
1846	gsl_matrix_set(DiagData->A[k], l, i, c[0] * a_i + s[0] * a_j);
1847	gsl_matrix_set(DiagData->A[k], l, j, -s[0] * a_i + c[0] * a_j);
1848	}
1849	//PrintGSLMatrix(P,A[k],AllocNum, "A (after rot)");
1850	}
1851	//fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
1852	//PrintGSLMatrix(P,DiagData->U,DiagData->AllocNum, "U (before rot)");
1853	// STEP 3e: apply U = R*U
1854	for (l=0;l<DiagData->AllocNum;l++) {
1855	a_i = gsl_matrix_get(DiagData->U,i,l);
1856	a_j = gsl_matrix_get(DiagData->U,j,l);
1857	gsl_matrix_set(DiagData->U, i, l, c[0] * a_i + s[0] * a_j);
1858	gsl_matrix_set(DiagData->U, j, l, -s[0] * a_i + c[0] * a_j);
1859	}
1860	//PrintGSLMatrix(P,DiagData->U,DiagData->AllocNum, "U (after rot)");
1861	}
1862	// and merry-go-round for the indices too
1863	//debug(P,"and merry-go-round for the indices too");
1864	if (DiagData->AllocNum > 2) MerryGoRoundIndices(DiagData->top, DiagData->bot, DiagData->AllocNum/2);
1865	}
1866
1867	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
1868	// STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
1869	old_spread = spread;
1870	spread = 0.;
1871	for(k=0;k<DiagData->NumMatrices;k++) { // go through all self-adjoint operators
1872	for (i=0; i < DiagData->AllocNum; i++) { // go through all wave functions
1873	spread += pow(gsl_matrix_get(DiagData->A[k],i,i),2);
1874	}
1875	}
1876	if(P->Par.me == 0) {
1877	//if(P->Call.out[ReadOut])
1878	// fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,spread,R->EpsWannier);
1879	//else
1880	//fprintf(stderr,"%2.9e\n",spread);
1881	}
1882	// STEP 5: check change of variance
1883	} while (fabs(old_spread-spread) >= R->EpsWannier);
1884	// end of iterative diagonalization loop: We have found our final orthogonal U!
1885	Free(c, "SerialDiagonalization: c");
1886	Free(s, "SerialDiagonalization: s");
1887	// free eigenvector stuff
1888	gsl_vector_free(h);
1889	gsl_matrix_free(G);
1890	//gsl_eigen_symmv_free(w);
1891	gsl_vector_free(eval);
1892	gsl_matrix_free(evec);
1893	}
1894
1895	/** Writes the wannier centres and spread to file.
1896	* Also puts centres from array into OnePsiElementAddData structure.
1897	* \param *P Problem at hand
1898	* \param old_spread first term of wannier spread
1899	* \param spread second term of wannier spread
1900	* \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
1901	* \param *WannierSpread array with wannier spread per wave function
1902	*/
1903	void WriteWannierFile(struct Problem P, double spread, double old_spread, double WannierCentre, double WannierSpread)
1904	{
1905	struct FileData *F = &P->Files;
1906	struct RunStruct *R = &P->R;
1907	struct Lattice *Lat = &P->Lat;
1908	struct Psis *Psi = &Lat->Psi;
1909	struct OnePsiElement *OnePsiA;
1910	char spin[12], suffix[18];
1911	int i,j,l;
1912
1913	if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) Spread printout\n", P->Par.me);
1914
1915	switch (Lat->Psi.PsiST) {
1916	case SpinDouble:
1917	strcpy(suffix,".spread.csv");
1918	strcpy(spin,"SpinDouble");
1919	break;
1920	case SpinUp:
1921	strcpy(suffix,".spread_up.csv");
1922	strcpy(spin,"SpinUp");
1923	break;
1924	case SpinDown:
1925	strcpy(suffix,".spread_down.csv");
1926	strcpy(spin,"SpinDown");
1927	break;
1928	}
1929	if (P->Par.me_comm_ST == 0) {
1930	if (R->LevSNo == Lat->MaxLevel-1) // open freshly if first level
1931	OpenFile(P, &F->SpreadFile, suffix, "w", P->Call.out[ReadOut]); // only open on starting level
1932	else if (F->SpreadFile == NULL) // re-open if not first level and not opened yet (or closed from ParseWannierFile)
1933	OpenFile(P, &F->SpreadFile, suffix, "a", P->Call.out[ReadOut]); // only open on starting level
1934	if (F->SpreadFile == NULL) {
1935	Error(SomeError,"WriteWannierFile: Error opening Wannier File!\n");
1936	} else {
1937	fprintf(F->SpreadFile,"#===== W A N N I E R C E N T R E S for Level %d of type %s ========================\n", R->LevSNo, spin);
1938	fprintf(F->SpreadFile,"# Orbital+Level\tx\ty\tz\tSpread\n");
1939	}
1940	}
1941
1942	// put (new) WannierCentres into local ones and into file
1943	i=-1;
1944	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
1945	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
1946	if (OnePsiA->PsiType == type) { // drop all but occupied ones
1947	i++; // increase l if it is occupied wave function
1948	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {// is this local?
1949	for (j=0;j<NDIM;j++)
1950	Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[i][j];
1951	Psi->AddData[OnePsiA->MyLocalNo].WannierSpread = WannierSpread[i];
1952	}
1953	if (P->Par.me_comm_ST == 0)
1954	fprintf(F->SpreadFile,"Psi%d_Lev%d\t%lg\t%lg\t%lg\t%lg\n", Psi->AllPsiStatus[i].MyGlobalNo, R->LevSNo, WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2], WannierSpread[i]);
1955	}
1956	}
1957	if (P->Par.me_comm_ST == 0) {
1958	fprintf(F->SpreadFile,"\n#Matrix traces\tB_ii\tA_ii^2\tTotal (B_ii - A_ii^2)\n");
1959	fprintf(F->SpreadFile,"TotalSpread_L%d\t%lg\t%lg\t%lg\n\n",R->LevSNo, old_spread, spread, old_spread - spread);
1960	fflush(F->SpreadFile);
1961	}
1962	}
1963
1964
1965	/** Parses the spread file and puts values into OnePsiElementAddData#WannierCentre.
1966	* \param *P Problem at hand
1967	* \return 1 - success, 0 - failure
1968	*/
1969	int ParseWannierFile(struct Problem *P)
1970	{
1971	struct Lattice *Lat = &P->Lat;
1972	struct RunStruct *R = &P->R;
1973	struct Psis *Psi = &Lat->Psi;
1974	struct OnePsiElement *OnePsiA;
1975	int i,l,j, msglen;
1976	FILE *SpreadFile;
1977	char *tagname = NULL;
1978	char suffix[18];
1979	double WannierCentre[NDIM+1]; // combined centre and spread
1980	MPI_Status status;
1981	int errorsignal = 0; // 1 - ok, 0 - error
1982
1983	switch (Lat->Psi.PsiST) {
1984	case SpinDouble:
1985	strcpy(suffix,".spread.csv");
1986	break;
1987	case SpinUp:
1988	strcpy(suffix,".spread_up.csv");
1989	break;
1990	case SpinDown:
1991	strcpy(suffix,".spread_down.csv");
1992	break;
1993	}
1994	if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Parsing Wannier Centres from file ... \n", P->Par.me);
1995
1996	if (P->Par.me_comm_ST == 0) {
1997	tagname = (char ) Malloc(sizeof(char)MAXSTRINGSIZE, "ParseWannierFile: *tagname");
1998	if(!OpenFile(P, &SpreadFile, suffix, "r", P->Call.out[ReadOut])) { // check if file exists
1999	if (MPI_Bcast(&errorsignal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
2000	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
2001	return 0;
2002	//Error(SomeError,"ParseWannierFile: Opening failed\n");
2003	}
2004	errorsignal = 1;
2005	if (MPI_Bcast(&errorsignal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
2006	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
2007	} else {
2008	if (MPI_Bcast(&errorsignal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
2009	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
2010	if (errorsignal == 0)
2011	return 0;
2012	}
2013	i=-1;
2014	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
2015	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
2016	if (OnePsiA->PsiType == type) { // drop all but occupied ones
2017	i++; // increase l if it is occupied wave function
2018	if (P->Par.me_comm_ST == 0) { // only process 0 may access the spread file
2019	sprintf(tagname,"Psi%d_Lev%d",i,R->LevSNo);
2020	errorsignal = 0;
2021	if (!ParseForParameter(0,SpreadFile,tagname,0,3,1,row_double,WannierCentre,1,optional)) {
2022	//Error(SomeError,"ParseWannierFile: Parsing WannierCentre failed");
2023	if (MPI_Bcast(&errorsignal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
2024	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
2025	return 0;
2026	}
2027	if (!ParseForParameter(0,SpreadFile,tagname,0,4,1,double_type,&WannierCentre[NDIM],1,optional)) {
2028	//Error(SomeError,"ParseWannierFile: Parsing WannierSpread failed");
2029	if (MPI_Bcast(&errorsignal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
2030	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
2031	return 0;
2032	}
2033	errorsignal = 1;
2034	if (MPI_Bcast(&errorsignal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
2035	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
2036	} else {
2037	if (MPI_Bcast(&errorsignal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
2038	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
2039	if (errorsignal == 0)
2040	return 0;
2041	}
2042	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) { // is this Psi local?
2043	if ((P->Par.me_comm_ST != 0) && (P->Par.me_comm_ST_Psi == 0)) { // if they don't belong to process 0 and we are a leader of a Psi group, receive 'em
2044	if (MPI_Recv(WannierCentre, NDIM+1, MPI_DOUBLE, 0, ParseWannierTag, P->Par.comm_ST_PsiT, &status) != MPI_SUCCESS)
2045	Error(SomeError,"ParseWannierFile: MPI_Recv of WannierCentre/Spread from process 0 failed");
2046	//return 0;
2047	MPI_Get_count(&status, MPI_DOUBLE, &msglen);
2048	if (msglen != NDIM+1)
2049	Error(SomeError,"ParseWannierFile: MPI_Recv of WannierCentre/Spread from process 0 failed due to wrong item count");
2050	//return 0;
2051	}
2052	if (MPI_Bcast(WannierCentre, NDIM+1, MPI_DOUBLE, 0, P->Par.comm_ST_Psi) != MPI_SUCCESS) // Bcast to all processes of the Psi group from leader
2053	Error(SomeError,"ParseWannierFile: MPI_Bcast of WannierCentre/Spread to sub process in Psi group failed");
2054	//return 0;
2055	// and store 'em (for all who have this Psi local)
2056	if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Psi %i, L %i: (x,y,z) = (%lg, %lg, %lg), Spread %lg\n",P->Par.me,i, R->LevSNo, WannierCentre[0], WannierCentre[1], WannierCentre[2], WannierCentre[NDIM]);
2057	for (j=0;j<NDIM;j++) Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[j];
2058	Psi->AddData[OnePsiA->MyLocalNo].WannierSpread = WannierCentre[NDIM];
2059	//if (P->Par.me == 0 && P->Call.out[ValueOut]) fprintf(stderr,"(%i) %s\t%lg\t%lg\t%lg\t\t%lg\n",P->Par.me, tagname,Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[0],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[1],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[2],Psi->AddData[OnePsiA->MyLocalNo].WannierSpread);
2060	} else if (P->Par.me_comm_ST == 0) { // if they are not local, yet we are process 0, send 'em to leader of its Psi group
2061	if (MPI_Send(WannierCentre, NDIM+1, MPI_DOUBLE, OnePsiA->my_color_comm_ST_Psi, ParseWannierTag, P->Par.comm_ST_PsiT) != MPI_SUCCESS)
2062	Error(SomeError,"ParseWannierFile: MPI_Send of WannierCentre/Spread to process 0 of owning Psi group failed");
2063	//return 0;
2064	}
2065	}
2066	}
2067	if ((SpreadFile != NULL) && (P->Par.me_comm_ST == 0)) {
2068	fclose(SpreadFile);
2069	Free(tagname, "ParseWannierFile: *tagname");
2070	}
2071	//fprintf(stderr,"(%i) Parsing Wannier files succeeded!\n", P->Par.me);
2072	return 1;
2073	}
2074
2075	/** Calculates the spread of orbital \a i.
2076	* Stored in OnePsiElementAddData#WannierSpread.
2077	* \param *P Problem at hand
2078	* \param i i-th wave function (note "extra" ones are not counted!)
2079	* \return spread \f$\sigma^2_{A^{(k)}}\f$
2080	*/
2081	double CalculateSpread(struct Problem *P, int i) {
2082	struct Lattice *Lat = &P->Lat;
2083	struct RunStruct *R = &P->R;
2084	struct Psis *Psi = &Lat->Psi;
2085	struct LatticeLevel *Lev0 = R->Lev0;
2086	struct LatticeLevel *LevS = R->LevS;
2087	struct Density *Dens0 = Lev0->Dens;
2088	struct fft_plan_3d *plan = Lat->plan;
2089	fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
2090	fftw_real PsiCR = (fftw_real )PsiC;
2091	fftw_complex *work = Dens0->DensityCArray[Temp2Density];
2092	fftw_real **HGcR = &Dens0->DensityArray[HGcDensity]; // use HGcDensity, 4x Gap..Density, TempDensity as a storage array
2093	fftw_complex HGcRC = (fftw_complex)HGcR;
2094	fftw_complex **HGcR2C = &Dens0->DensityCArray[HGcDensity]; // use HGcDensity, 4x Gap..Density, TempDensity as an array
2095	fftw_real HGcR2 = (fftw_real)HGcR2C;
2096	MPI_Status status;
2097	struct OnePsiElement OnePsiA, LOnePsiA;
2098	int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
2099	fftw_complex *LPsiDatA=NULL;
2100	int k,n[NDIM],n0, *N,N0, g, p, iS, i0, Index;
2101	N0 = LevS->Plan0.plan->local_nx;
2102	N = LevS->Plan0.plan->N;
2103	const int NUpx = LevS->NUp[0];
2104	const int NUpy = LevS->NUp[1];
2105	const int NUpz = LevS->NUp[2];
2106	double a_ij, b_ij, A_ij, B_ij;
2107	double tmp, tmp2, spread = 0;
2108	double cos_lookup, sin_lookup;
2109
2110	b_ij = 0;
2111
2112	CreateSinCosLookupTable(&cos_lookup,&sin_lookup,N);
2113
2114	// fill matrices
2115	OnePsiA = &Psi->AllPsiStatus[i]; // grab the desired OnePsiA
2116	if (OnePsiA->PsiType != Extra) { // drop if extra one
2117	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
2118	LOnePsiA = &Psi->LocalPsiStatus[OnePsiA->MyLocalNo];
2119	else
2120	LOnePsiA = NULL;
2121	if (LOnePsiA == NULL) { // if it's not local ... receive it from respective process into TempPsi
2122	RecvSource = OnePsiA->my_color_comm_ST_Psi;
2123	MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag1, P->Par.comm_ST_PsiT, &status );
2124	LPsiDatA=LevS->LPsi->TempPsi;
2125	} else { // .. otherwise send it to all other processes (Max_me... - 1)
2126	for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
2127	if (p != OnePsiA->my_color_comm_ST_Psi)
2128	MPI_Send( LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag1, P->Par.comm_ST_PsiT);
2129	LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
2130	} // LPsiDatA is now set to the coefficients of OnePsi either stored or MPI_Received
2131
2132	CalculateOneDensityR(Lat, LevS, Dens0, LPsiDatA, Dens0->DensityArray[ActualDensity], R->FactorDensityR, 1);
2133	// note: factor is not used when storing result in DensityCArray[ActualPsiDensity] in CalculateOneDensityR()!
2134	for (n0=0;n0<N0;n0++)
2135	for (n[1]=0;n[1]<N[1];n[1]++)
2136	for (n[2]=0;n[2]<N[2];n[2]++) {
2137	i0 = n[2]NUpz+N[2]NUpz(n[1]NUpy+N[1]NUpyn0*NUpx);
2138	iS = n[2]+N[2](n[1]+N[1]n0);
2139	n[0] = n0 + LevS->Plan0.plan->start_nx;
2140	for (k=0;k<max_operators;k+=2) {
2141	tmp = 2PI/(double)(N[k/2])(double)(n[k/2]);
2142	tmp2 = PsiCR[i0] /LevS->MaxN;
2143	// check lookup
2144	if ((fabs(cos(tmp) - cos_lookup[k/2][n[k/2]]) > MYEPSILON) \|\| (fabs(sin(tmp) - sin_lookup[k/2][n[k/2]]) > MYEPSILON)) {
2145	fprintf(stderr,"(%i) (cos) %2.15e against (lookup) %2.15e,\t(sin) %2.15e against (lookup) %2.15e\n", P->Par.me, cos(tmp), cos_lookup[k/2][n[k/2]],sin(tmp),sin_lookup[k/2][n[k/2]]);
2146	Error(SomeError, "Lookup table does not match real value!");
2147	}
2148	// HGcR[k][iS] = cos_lookup[k/2][n[k/2]] * tmp2; /* Matrix Vector Mult */
2149	// HGcR2[k][iS] = cos_lookup[k/2][n[k/2]] * HGcR[k][iS]; /* Matrix Vector Mult */
2150	// HGcR[k+1][iS] = sin_lookup[k/2][n[k/2]] * tmp2; /* Matrix Vector Mult */
2151	// HGcR2[k+1][iS] = sin_lookup[k/2][n[k/2]] * HGcR[k+1][iS]; /* Matrix Vector Mult */
2152	HGcR[k][iS] = cos(tmp) * tmp2; /* Matrix Vector Mult */
2153	HGcR2[k][iS] = pow(cos(tmp),2) * tmp2; /* Matrix Vector Mult */
2154	HGcR[k+1][iS] = sin(tmp) * tmp2; /* Matrix Vector Mult */
2155	HGcR2[k+1][iS] = pow(sin(tmp),2) * tmp2; /* Matrix Vector Mult */
2156	}
2157	}
2158	for (k=0;k<max_operators;k++) {
2159	fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC[k], work);
2160	fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcR2C[k], work);
2161	}
2162
2163
2164	for (k=0;k<max_operators;k++) {
2165	a_ij = 0;
2166	//fprintf(stderr,"(%i),(%i,%i): A[%i]: multiplying with \\phi_B\n",P->Par.me, l,m,k);
2167	// sum directly in a_ij and b_ij the two desired terms
2168	g=0;
2169	if (LevS->GArray[0].GSq == 0.0) {
2170	Index = LevS->GArray[g].Index;
2171	a_ij += (LPsiDatA[0].reHGcRC[k][Index].re + LPsiDatA[0].imHGcRC[k][Index].im);
2172	b_ij += (LPsiDatA[0].reHGcR2C[k][Index].re + LPsiDatA[0].imHGcR2C[k][Index].im);
2173	g++;
2174	}
2175	for (; g < LevS->MaxG; g++) {
2176	Index = LevS->GArray[g].Index;
2177	a_ij += 2(LPsiDatA[g].reHGcRC[k][Index].re + LPsiDatA[g].im*HGcRC[k][Index].im);
2178	b_ij += 2(LPsiDatA[g].reHGcR2C[k][Index].re + LPsiDatA[g].im*HGcR2C[k][Index].im);
2179	} // due to the symmetry the resulting matrix element is real and symmetric in (i,i) ! (complex multiplication simplifies ...)
2180	MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
2181	spread += pow(A_ij,2);
2182	}
2183	}
2184	MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
2185
2186	// store spread in OnePsiElementAdd
2187	Psi->AddData[i].WannierSpread = B_ij - spread;
2188
2189	FreeSinCosLookupTable(cos_lookup,sin_lookup);
2190
2191	return (B_ij - spread);
2192	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: pcp/src/wannier.c@ f70c2a

Download in other formats: