source: util/src/dynamicANOVA.sh.in@ 03c204

Last change on this file since 03c204 was b562aa, checked in by Frederik Heber <heber@…>, 17 years ago

rewritten MultiRunSim to allow for minimum of rsh calls

Previously, MultiRunSim went through the fragments one after the other and called mpirun to commit each of them to a node. As the calculations with MPQC are so fast (roughly a second each), this led to a lot of rsh calls. The inetd of the job-distributing node subsequently hung after a short while. This was especially a problem in the BOSSANOVA scheme.
Now, we split the total set of fragments up into as many packets as there are processor groups and commit each packet by a single rsh call, with all jobs in the packet concatenated with ";". Hence, we have far fewer rsh calls and no hang-up of inetd. Note, however, that for small molecules this may still lead to the behaviour described above, i.e. if only one or two jobs remain per packet. This cannot be overcome by any means other than lowering the number of processor groups.

  • Property mode set to 100755
File size: 8.5 KB
Line 
1#!@SHELL@
2#
3# Performs a molecular dynamics simulation with the BOSSANOVA method
4
5#MPIRUN="/opt/packages/mpichgm-1.2.7..15/bin/mpirun.ch_gm"
6MPIRUN="/usr/bin/mpirun.mpich"
7exec_prefix="@prefix@"
8database="@bindir@"
9MOLECUILDER="@bindir@/molecuilder"
10JOINER="@bindir@/joiner"
11CRUNCHER="/mount/bespin/heber/build/mpqc-2.3.0/bin/mpqc"
12CONVERTER="/mount/bespin/heber/tmp/mpqc/espack2mpqc.py"
13PREPARER="/mount/bespin/heber/tmp/mpqc/convertresults.sh"
14
15function check()
16{
17 #1 MESSAGE
18 if [ $? -eq 0 ]; then
19 if [ -z $1 ]; then
20 echo "ok."
21 else
22 echo "ok: $1."
23 fi
24 else
25 if [ -z $1 ]; then
26 echo "failed."
27 else
28 echo "failed: $1."
29 fi
30 exit 1
31 fi
32}
33
34#function MultiRunSim {
35# # 1 is config file dir (with all files)
36# # 2 is the machine file
37#
38# ${JOBRUNNER} --mpqc ${CRUNCHER} -nprocpernode 2 -nprocperjob 1 -nthreadperproc 2 --threadgrp=posix --messagegrp=proc --memorygrp=proc --nodefile $2 --readdir $1 --inputprefix=${1}/ --outputprefix=${1}/ --autoout --verbose --rerun 2>/dev/stdout | tee -a dynamic.log
39#}
40
41function MultiRunSim {
42 # 1 is the number of groups
43 # 2 is the directory
44 # 3, ... are config files
45
46 # find the next free proc group
47 divisor=$1
48 shift
49 DIR=$1
50 shift
51 started=0
52 pwd=`pwd`
53 while [ $started -eq 0 ]; do
54 groupnr=1
55 while [ $groupnr -le $divisor ]; do
56 if [ ! -e "${DIR}/ProcRuns${groupnr}" ]; then
57 #MaxNodes=`cat ${DIR}/ProcGroup${groupnr} | awk 'END{print NR}'`
58 #gamma=`grep ProcPEGamma $1 | awk -F"\t" {'print $2'}`
59 #psi=`grep ProcPEPsi $1 | awk -F"\t" {'print $2'}`
60 #let nodes=$gamma*$psi
61 #if [ $nodes -gt $MaxNodes ]; then
62 # echo "Process $1 needs too many nodes! Breaking." | tee -a dynamic.log
63 # exit 1
64 #fi
65 nodes=1
66 echo "touch ${DIR}/ProcRuns${groupnr}" >"${DIR}/ProcBatch${groupnr}"
67 if [ ! -z $1 ]; then
68 echo -n "rsh `cat <${DIR}/ProcGroup${groupnr}` 'cd ${pwd}/${DIR}" >>"${DIR}/ProcBatch${groupnr}"
69 fi
70 while [ ! -z $1 ]; do # add all config files as single lines
71 #echo -n "${MPIRUN} -machinefile ${DIR}/ProcGroup${groupnr} -np $nodes " >>"${DIR}/ProcBatch${groupnr}"
72 echo -n "; ${CRUNCHER} -o ${1/conf/out} ${1/conf/in}" >>"${DIR}/ProcBatch${groupnr}"
73 shift
74 done
75 echo "'" >>"${DIR}/ProcBatch${groupnr}"
76 echo "rm -f ${DIR}/ProcRuns${groupnr}" >>"${DIR}/ProcBatch${groupnr}"
77 /bin/sh "${DIR}/ProcBatch${groupnr}" &
78 started=1
79 let groupnr=${divisor}+1
80 else
81 let groupnr=$groupnr+1
82 fi
83 done
84 # wait a few seconds
85 #if [ $2 -gt 1 ]; then
86 # sleep 2
87 #fi
88 done
89}
90
91# get command line options
92if [ -z $3 ]; then
93 echo "Usage: $0 <config file> <Order> <max. bond distance> <MaxNodes> [MaxMDsteps]"
94 echo -e "\t<config file> the pcp config file of the total molecule"
95 echo -e "\t<Order> the highest bond order (i.e. the cutoff number in ANOVA series expansion)"
96 echo -e "\t<max. bond distance> maximum distance to look for bonds (bonds are associated by element covalent radii criterion)"
97 echo -e "\t[MaxMDSteps] overrides given MaxOuterStep in config file"
98 exit 1;
99else
100 arg=$1
101 mainname=`grep mainname $arg | awk -F"\t" {'print $2'}`
102 order=$2
103 distance=$3
104 if [ -z $4 ]; then
105 MaxSteps=`grep MaxOuterStep $arg | awk -F"\t" {'print $2'}`
106 else
107 MaxSteps=$4
108 fi
109 echo "Going to run for a total of $MaxSteps steps, bond order $order and maximum distance $distance of config file $arg." | tee -a dynamic.log
110fi
111
112
113# get the directory
114DIR=`dirname $arg`
115if [ -z "`grep $DIR $arg`" ]; then
116 echo "Cannot find the directory $DIR in the config file." | tee -a dynamic.log
117 exit 1;
118else
119 echo "Using $DIR as directory." | tee -a dynamic.log
120fi
121
122PBS_NODEFILE="${DIR}/machines"
123if [ ! -e $PBS_NODEFILE ]; then
124 echo "localhost" >$PBS_NODEFILE
125fi
126
127# delete old processor group files
128rm ${DIR}/ProcGroup* -f
129rm ${DIR}/ProcRuns* -f
130rm ${DIR}/ProcBatch* -f
131
132# put nodes into groups
133MaxNodes=0
134for node in `cat <$PBS_NODEFILE`; do
135 let MaxNodes=$MaxNodes+1
136done
137gamma=`grep ProcPEGamma $arg | awk -F"\t" {'print $2'}`
138psi=`grep ProcPEPsi $arg | awk -F"\t" {'print $2'}`
139let nodes=$gamma*$psi
140let divisor=$MaxNodes/$nodes
141echo "Using $divisor processor groups." | tee -a dynamic.log
142nodenr=0
143groupnr=1
144for node in `cat <$PBS_NODEFILE`; do
145 let nodenr=$nodenr+1
146 #echo "Current node $nodenr is $node." | tee -a dynamic.log
147 let currentgrouplimit=$groupnr*$nodes
148 if [ $currentgrouplimit -lt $nodenr ]; then
149 let groupnr=$groupnr+1
150 fi
151 #echo "Putting into group $groupnr." | tee -a dynamic.log
152 echo "$node" >>"${DIR}/ProcGroup${groupnr}"
153done
154i=0
155while [ $i -lt $groupnr ]; do
156 let i=$i+1
157 echo "Group nr. $i" | tee -a dynamic.log
158 echo "===========" | tee -a dynamic.log
159 cat <"${DIR}/ProcGroup${i}"
160 cat <"${DIR}/ProcGroup${i}" >>dynamic.log
161 echo -e "\n" | tee -a dynamic.log
162done
163
164# copy first conf
165cp $arg ${arg}.MD
166
167
168i=1;
169while [ $i -le $MaxSteps ]; do
170# break down the molecule with molecuilder
171 echo -n "Fragmenting ... " | tee -a dynamic.log
172 ${MOLECUILDER} ${arg}.MD -e ${database} -f $distance $order 2>/dev/null >/dev/null
173 check | tee -a dynamic.log
174 echo "done." | tee -a dynamic.log
175
176# get the number of digits of the fragment count
177 digits=1
178 while [ ! -e ${DIR}/BondFragment`printf "%0${digits}d" 0`.conf ]; do
179 let digits=$digits+1
180 done
181 echo "Found $digits digits for the fragment number." | tee -a dynamic.log
182
183# get the fragment count
184 frag=0
185 while [ -e ${DIR}/BondFragment`printf "%0${digits}d" $frag`.conf ]; do
186 # unset MaxOuterStep in config file
187 sed -i -e "s#MaxOuterStep.*\##MaxOuterStep\t0\t\##" ${DIR}/BondFragment`printf "%0${digits}d" $frag`.conf
188 rm -rf ${DIR}/BondFragment`printf "%0${digits}d" $frag`
189 let frag=$frag+1
190 done
191 echo "There are $frag fragments." | tee -a dynamic.log
192
193
194# evaluate each fragment
195# j=0
196# while [ $j -lt $frag ]; do
197# number=`printf "%0${digits}d" $j`
198# # convert all configs
199# echo -n "Converting ${DIR}/BondFragment${number}.conf ..." | tee -a dynamic.log
200# sh $CONVERTER ${DIR}/BondFragment${number}.conf
201# check | tee -a dynamic.log
202# let j=$j+1
203# done
204#
205# MultiRunSim ${DIR} $PBS_NODEFILE
206#
207# j=0
208# while [ $j -lt $frag ]; do
209# number=`printf "%0${digits}d" $j`
210# # rename output files
211# echo -n "Renaming `ls ${DIR}/BondFragment${number}.out.001.02.02` ..." | tee -a dynamic.log
212# mv ${DIR}/BondFragment${number}.out.001.02.02 ${DIR}/BondFragment${number}.out
213# check | tee -a dynamic.log
214# let j=$j+1
215# done
216
217 # reset command arrays
218 grp=0;
219 while [ $grp -lt $divisor ]; do
220 command[$grp]=""
221 let grp=$grp+1
222 done
223
224 # distribute the jobs among the groups
225 j=0;
226 while [ $j -lt $frag ]; do
227 number=`printf "%0${digits}d" $j`
228 # convert all configs
229 #echo -n "Converting ${DIR}/BondFragment${number}.conf ..." | tee -a dynamic.log
230 #sh $CONVERTER ${DIR}/BondFragment${number}.conf
231 #check | tee -a dynamic.log
232 # and distribute
233 let grp=${j}%${divisor}
234 #echo "BondFragment${number}.conf is evaluated by group $grp."
235 command[$grp]="${command[$grp]}BondFragment${number}.conf "
236 let j=$j+1
237 done
238
239 # go through all groups and run the job
240 grp=0;
241 while [ $grp -lt $divisor ]; do
242 number=`printf "%0${digits}d" $j`
243 echo -n "Starting calculation of group $grp with fragments \"${command[$grp]}\" at step $i ... " | tee -a dynamic.log
244 MultiRunSim $divisor ${DIR} ${command[$grp]}
245 echo "done." | tee -a dynamic.log
246 let grp=$grp+1
247 done
248
249# wait till all ProcRuns files are gone
250# if [ $divisor -gt 1 ]; then
251 echo "Waiting for all running jobs at step $i to end ... " | tee -a dynamic.log
252 while [ ! -z "`find ${DIR} -name 'ProcRuns*'`" ]; do
253 #if [ ! -z "`find ${DIR} -name 'ProcRuns*'`" ]; then
254 # echo "still `ls ${DIR}/ProcRuns*` present"
255 #fi
256 sleep 1
257 done
258 echo "done." | tee -a dynamic.log
259# fi
260
261
262# convert results
263 sleep 1 # necessary for result files to close
264 echo -n "Converting all results ... " | tee -a dynamic.log
265 sh $PREPARER $DIR
266 check | tee -a dynamic.log
267
268# join the resulting forces into a single file
269 echo -n "Joining fragment energies ... " | tee -a dynamic.log
270 ${JOINER} ${DIR}/ $mainname >/dev/null 2>/dev/null
271 check | tee -a dynamic.log
272 echo "done." | tee -a dynamic.log
273
274# move the ions by calling pcp with this force file
275 sed -i -e "s#MaxOuterStep.*\##MaxOuterStep\t$i\t\##" ${arg}.MD
276 echo -n "Moving ions with obtained forces at step $i ... " | tee -a dynamic.log
277 $MOLECUILDER ${arg}.MD -e ${database} -P "${DIR}/pcp.Order${order}.forces.all" 2>/dev/null >/dev/null
278 echo "done" | tee -a dynamic.log
279
280# last of all, put "joined" energy and forces under this step
281 cp ${DIR}/pcp.Order${order}.energy.all ${DIR}/pcp.step${i}.energy.all
282 cp ${DIR}/pcp.Order${order}.forces.all ${DIR}/pcp.step${i}.forces.all
283
284# next step
285 let i=$i+1
286done
287
288exit 0
Note: See TracBrowser for help on using the repository browser.