Blog: Walking the Chemical Space: main.py

File main.py, 17.3 KB (added by FrederikHeber, 2 years ago)

Main script to generate configurations, run using python and putting pyMoleCuilder into PYTHONPATH

Line 
1# (C) 2020 Frederik Heber
2#
3# This script randomly "grows" a molecule by using the following actions by
4# pyMoleCuilder:
5# - SaturateAtoms
6# - ChangeElement
7#
8# This is to explore the space of chemical graphs.
9#
10# NOTE: For this to work, we need to point PYTHONPATH to a molecuilder installation
11# directory.
12import os, sys, random
13from itertools import chain, combinations
14
15import pyMoleCuilder as mol
16
17
18counter = 0
19counter_all = 0
20
21
22
23
24typical_distance = {
25 "H": 0.6,
26 "C": 1.,
27 "N": 1.,
28 "O": 1.,
29 "P": 1.,
30 "S": 1.
31}
32valencies = {
33 "H": 1,
34 "C": 4,
35 "N": 3,
36 "O": 2,
37# "P": 3,
38# "S": 2
39}
40atomicNumbers = {
41 1: "H",
42 6: "C",
43 7: "N",
44 8: "O",
45 15: "P",
46 16: "S",
47}
48elements = valencies.keys()
49
50MAX_DEGREE = 3
51
52THEORY = "MBPT2"
53BASISNAME = "6-311G"
54
55homology_container_filename = "homologies_"+THEORY+"_"+BASISNAME+".dat"
56atomfragments_filename = "atomfragments_"+THEORY+"_"+BASISNAME+".dat"
57
58#random.seed(426)
59SEED = random.randint(0, 1e8)
60
61
62def callCounter(function_name, *args, **kwargs):
63 global counter
64 global counter_all
65 counter_all += 1
66 # increase on actions, decrease on undos
67 if function_name == "Undo":
68 counter -= 1
69 elif 'A' <= function_name[0] <= 'Z':
70 # skip all lower case commands (which are pure python without side effects)
71 counter += 1
72 method_to_call = getattr(mol, function_name)
73 return method_to_call(*args, **kwargs)
74
75
76def add_seed_atom(element):
77 # has side effects
78 callCounter("SelectionAllMolecules")
79 callCounter("AtomAdd", add_atom = element, domain_position="10,10,10")
80
81
82def change_element(element):
83 # has side effects
84 callCounter("AtomChangeElement", change_element = element)
85 callCounter("FragmentationClearFragmentationState")
86 callCounter("wait")
87
88
89def saturate_all_atoms():
90 # has side effects
91 callCounter("SelectionAllAtoms")
92 callCounter("AtomSaturate", use_outer_shell = "1")
93 callCounter("SelectionClearAllAtoms")
94 callCounter("wait")
95
96
97def select_atoms_randomly(number):
98 # has side effects
99 callCounter("SelectionClearAllAtoms")
100 callCounter("SelectionAtomByRandom", select_atom_by_random = str(number))
101 callCounter("wait")
102
103def check_for_present_hydrogens():
104 # has no side effects
105 callCounter("SelectionPushAtoms")
106 callCounter("SelectionClearAllAtoms")
107 callCounter("SelectionAtomByElement", "H")
108 callCounter("wait")
109 hydrogen_ids = callCounter("getSelectedAtomIds")
110 callCounter("SelectionPopAtoms")
111 if len(hydrogen_ids) == 0:
112 return False
113 return True
114
115
116def read_lastline(f):
117 f.seek(-2,2)##
118 while f.read(1) != b'\n':
119 f.seek(-2, 1)
120 return f.read()
121
122
123def get_element_of_selected_atom():
124 # has no side effects
125 e = callCounter("getSelectedAtomElements")
126 return atomicNumbers[int(e[0])]
127
128
129pair_correlation_filename = "paircorrelation.dat"
130pair_correlation_bin_filename = "paircorrelation_bins.dat"
131
132
133def check_pair_correlation_distance(typical_distance, element):
134 # has no side effects
135 bin_end=typical_distance+0.1
136 mol.AnalysisPairCorrelation(
137 elements=element,
138 bin_start="0.", bin_width="0.1", bin_end=str(bin_end),
139 output_file=pair_correlation_filename,
140 bin_output_file=pair_correlation_bin_filename,
141 periodic="0"
142 )
143 callCounter("wait")
144 bin = bin_end
145 with open(pair_correlation_bin_filename, "r") as f:
146 header = f.readline()
147 assert( header.split()[3] == "Count")
148 content = f.readlines()
149 for line in content:
150 fields = line.split()
151 if int(fields[3]) != 0:
152 bin = float(fields[0])
153 print("First non-zero bin to "+element+" is at "+str(bin))
154 break
155 #[os.remove(f) for f in [filename, bin_filename]]
156 callCounter("wait")
157 return float(bin) > typical_distance
158
159
160def extract_atom_name(text):
161 return text.split(",")[0]
162
163
164def parse_pair_correlation_distance(min_distance):
165 returnlist = []
166 with open(pair_correlation_filename, "r") as f:
167 header = f.readline()
168 assert( header.split()[0] == "BinStart")
169 content = f.readlines()
170 for line in content:
171 fields = line.split()
172 binstart = float(fields[0])
173 if binstart < min_distance:
174 returnlist.append(extract_atom_name(fields[3]))
175 return returnlist
176
177
178def get_mean_position_of_selected_atoms():
179 # has no side effects
180 mean_pos = [0., 0., 0.]
181 callCounter("wait")
182 positions = callCounter("getSelectedAtomPositions")
183 if len(positions) > 0:
184 for pos in positions:
185 for i in range(3):
186 mean_pos[i] += pos[i]
187 for i in range(3):
188 mean_pos[i] /= len(positions)
189 return mean_pos
190
191
192def get_all_hydrogen_bond_neighbors(current_id):
193 # has no side effects
194 callCounter("SelectionPushAtoms")
195 callCounter("SelectionClearAllAtoms")
196 callCounter("SelectionAtomById", str(current_id))
197 callCounter("SelectionAtomBondNeighbors")
198 for other_element in non_hydrogen_elements:
199 callCounter("SelectionNotAtomByElement", other_element)
200 # not needed as it is contained in elements
201 # callCounter("SelectionNotAtomById", str(current_id))
202 callCounter("wait")
203 hydrogen_ids = callCounter("getSelectedAtomIds")
204 print("Hydrogens are "+str([str(h) for h in hydrogen_ids]))
205 callCounter("SelectionPopAtoms")
206 return hydrogen_ids
207
208
209def prepare_run(bond_table, filename = ""):
210 # has side effects
211 if len(filename) > 0:
212 print("Using file "+filename)
213 if filename.rfind('.') == -1:
214 prefix = filename
215 else:
216 prefix = filename[0:filename.rfind('.')]
217 callCounter("WorldInput", filename)
218 print("Using prefix " + prefix)
219 callCounter("ParserSetOutputFormats", "tremolo")
220 callCounter("ParserSetTremoloAtomdata", "Id x=3 u=3 type neighbors=8")
221 callCounter("ParserSetParserParameters", "mpqc", "theory="+THEORY+";basis="+BASISNAME+";")
222 callCounter("CommandSetRandomNumbersEngine", set_random_number_engine="mt19937", random_number_engine_parameters='seed='+str(SEED))
223 callCounter("CommandBondLengthTable", bond_table)
224 callCounter("WorldChangeBox", "20,0,0,20,0,20")
225 callCounter("wait")
226 return prefix
227
228
229def parse_containers():
230 # has side effects
231 if os.path.exists(homology_container_filename):
232 callCounter("PotentialParseHomologies", homology_container_filename)
233 if os.path.exists(atomfragments_filename):
234 callCounter("PotentialParseAtomFragments", atomfragments_filename)
235 callCounter("wait")
236
237
238def store_containers():
239 # has side effects
240 callCounter("PotentialSaveHomologies", homology_container_filename)
241 callCounter("PotentialSaveAtomFragments", atomfragments_filename)
242 callCounter("wait")
243
244
245class FolderScope(object):
246 '''
247 Scope to step into a folder and automatically step back on exit
248 '''
249 def __init__(self, folder_name):
250 self.pwd = os.getcwd()
251 self.folder_name = folder_name
252 def __enter__(self):
253 if not os.path.exists(self.folder_name):
254 os.makedirs(self.folder_name)
255 os.chdir(self.folder_name)
256 def __exit__(self, type, value, traceback):
257 os.chdir(self.pwd)
258
259
260class UndoScope(object):
261 '''
262 Scope to automatically undo all molecuilder actions executed within
263 its scope.
264 '''
265 def __init__(self):
266 # print("ENTERing UndoScope at " + str(counter))
267 callCounter("CommandUndoMark", "1")
268 def __enter__(self):
269 mol.wait()
270 def __exit__(self, type, value, traceback):
271 # print("EXITing UndoScope at " + str(counter))
272 callCounter("Undo", till_mark="1")
273 mol.wait()
274
275
276def get_mean_position_of_hydrogens(current_position):
277 # has no side effects
278 mean_pos = get_mean_position_of_selected_atoms()
279 # print("Mean positions " + str(mean_pos))
280
281 # compare to position of non-hydrogen atom
282 if sum([pow(current_position[i] - mean_pos[i], 2) for i in range(3)]) < 0.1:
283 # too close to non-hydrogen, hence pick the first of the hydrogen positions
284 print("Mean position " + str(mean_pos) + " is too close to non-hydrogen " + str(
285 current_position) + ", using one of the hydrogens.")
286 callCounter("wait")
287 mean_pos = callCounter("getSelectedAtomPositions")[0]
288 return mean_pos
289
290
291def replace_hydrogens_by_atom_rescale_and_saturate(pick_element, mean_pos, bond_degree, current_id):
292 # has side effects
293 # remove hydrogens and add new atom
294 callCounter("SelectionAllMolecules")
295 callCounter("AtomRemove")
296 mol.AtomAdd(add_atom=pick_element,
297 domain_position='{},{},{}'.format(mean_pos[0], mean_pos[1], mean_pos[2]))
298
299 # set bond degree and scale bond distance to typical distance
300 callCounter("SelectionAtomByOrder", '-1')
301 callCounter("SelectionAtomById", str(current_id))
302 callCounter("BondAdd")
303 callCounter("BondSetDegree", str(bond_degree))
304 #callCounter("GraphUpdateMolecules")
305 callCounter("MoleculeStretchBond", stretch_bond="0.")
306
307 # bondify and saturate
308 callCounter("SelectionNotAtomById", str(current_id))
309 callCounter("AtomBondify", max_hydrogens="1")
310 callCounter("AtomSaturate", use_outer_shell = "1")
311 callCounter("SelectionClearAllAtoms")
312 callCounter("wait")
313
314
315def calculate_energy(loop_prefix):
316 # has side effects
317 callCounter("SelectionPushAtoms")
318 callCounter("SelectionAllAtoms")
319 callCounter("AtomRandomPerturbation", random_perturbation="0.01")
320 callCounter("FragmentationClearFragmentationState")
321 callCounter("FragmentationFragmentation",
322 fragment_molecule=loop_prefix,
323 DoSaturate="1",
324 ExcludeHydrogen="1",
325 order="3",
326 output_types="",
327 parse_state_files="0"
328 )
329 callCounter("FragmentationFragmentationAutomation",
330 server_address="eumaios",
331 server_port="20024",
332 DoLongrange="0"
333 )
334 callCounter("FragmentationAnalyseFragmentationResults")
335 unique_filename = ensure_unique_filename(loop_prefix, 'FragmentResults.dat')
336 callCounter("FragmentationSaveFragmentResults", save_fragment_results=unique_filename)
337 callCounter("SelectionPopAtoms")
338 callCounter("wait")
339
340
341def get_unique_filename(num_atoms):
342 # has no side effects
343 callCounter("SelectionPushAtoms")
344 callCounter("SelectionClearAllAtoms")
345 callCounter("SelectionAllAtoms")
346 callCounter("SelectionNotAtomByElement", "H")
347 callCounter("wait")
348 atom_ids_check = callCounter("getSelectedAtomIds")
349 assert (len(atom_ids_check) == num_atoms)
350 graph6_string = callCounter("getGraph6String")
351 # hard code canonical form for N=3
352 if graph6_string in ["Bo", "Bg"]:
353 graph6_string = "BW"
354 elementlist_string = callCounter("getElementListAsString")
355 unique_filename = graph6_string+"_-_"+elementlist_string.replace(" ", "_")
356 print("Picking "+unique_filename+" as filename.")
357 callCounter("SelectionPopAtoms")
358 return unique_filename
359
360
361def ensure_unique_filename(prefix, suffix):
362 unique_filename = prefix+suffix
363 if os.path.exists(unique_filename):
364 for i in range(1000):
365 if not os.path.exists(prefix + "-" + str(i) + suffix):
366 unique_filename = prefix + "-" + str(i) + suffix
367 break
368 return unique_filename
369
370
371def extend_graph_by_element(loop_prefix, pick_element, current_id, current_position, hydrogen_ids):
372 # has side effects
373 callCounter("SelectionClearAllAtoms")
374 callCounter("SelectionAtomById", " ".join([str(item) for item in hydrogen_ids]))
375 callCounter("wait")
376
377 # get mean of their positions
378 bond_degree = len(hydrogen_ids)
379 assert( 1 <= bond_degree <= 3)
380 mean_pos = get_mean_position_of_hydrogens(current_position)
381
382 # add atom, add bond, rescale, saturate, bondify
383 replace_hydrogens_by_atom_rescale_and_saturate(pick_element, mean_pos, bond_degree, current_id)
384
385
386def powerset(iterable):
387 "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
388 s = list(iterable)
389 return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
390
391
392def get_all_non_hydrogen_atoms():
393 callCounter("SelectionPushAtoms")
394 callCounter("SelectionClearAllAtoms")
395 callCounter("SelectionAllAtoms")
396 callCounter("SelectionNotAtomByElement", "H")
397 callCounter("wait")
398 atom_ids = callCounter("getSelectedAtomIds")
399 callCounter("SelectionPopAtoms")
400 return atom_ids
401
402
403def get_all_hydrogen_atoms():
404 callCounter("SelectionPushAtoms")
405 callCounter("SelectionClearAllAtoms")
406 callCounter("SelectionAtomByElement", "H")
407 callCounter("wait")
408 hydrogen_ids = callCounter("getSelectedAtomIds")
409 callCounter("SelectionPopAtoms")
410 return hydrogen_ids
411
412
413def get_position_of_id(current_id):
414 callCounter("SelectionPushAtoms")
415 callCounter("SelectionClearAllAtoms")
416 callCounter("SelectionAtomById", str(current_id))
417 callCounter("wait")
418 current_position = callCounter("getSelectedAtomPositions")[0]
419 callCounter("SelectionPopAtoms")
420 return current_position
421
422
423def recurse(num_atoms, left_atoms):
424 if left_atoms <= 0:
425 return
426
427 loop_prefix = prefix + '-' + str(num_atoms-left_atoms)
428
429 # check if we can continue due to present hydrogens
430 if not check_for_present_hydrogens():
431 print("Stopping because there are no more hydrogens in the fully saturated system.")
432 return
433
434 atom_ids = get_all_non_hydrogen_atoms()
435 assert( left_atoms == num_atoms-len(atom_ids) )
436
437 # go through every element
438 for pick_element in non_hydrogen_elements:
439 print("Current element is " + pick_element)
440
441 # go through all atoms
442 for current_id in atom_ids:
443 print("Current id is " + str(current_id))
444 current_position = get_position_of_id(current_id)
445
446 # get all bonded hydrogen ids of this element
447 hydrogen_ids = get_all_hydrogen_bond_neighbors(current_id)
448 print("Non-hydrogen atom has " + str(len(hydrogen_ids)) + " hydrogens")
449
450 # pick every combination of hydrogen_ids till a maximum bond degree
451 for pick_hydrogen_ids in powerset(hydrogen_ids):
452 max_degree = min(MAX_DEGREE, valencies[pick_element])
453 if 0 < len(pick_hydrogen_ids) <= max_degree:
454 print("Current set in powerset is " + str(pick_hydrogen_ids))
455 with UndoScope():
456 extend_graph_by_element(loop_prefix, pick_element, current_id, current_position,
457 pick_hydrogen_ids)
458 atom_ids_check = get_all_non_hydrogen_atoms()
459 hydrogen_ids_check = get_all_hydrogen_atoms()
460 print("Non-hydrogens " +str(atom_ids_check)+", hydrogens "+str(hydrogen_ids_check))
461 assert( all([i not in hydrogen_ids_check for i in pick_hydrogen_ids]))
462 #assert( all([i not in atom_ids_check for i in pick_hydrogen_ids]))
463
464 # save current state
465 if left_atoms == 1:
466 unique_filename = ensure_unique_filename(loop_prefix + "_-_" + get_unique_filename(num_atoms), ".data")
467 callCounter("WorldOutputAs", unique_filename)
468 callCounter("wait")
469
470 # calculate energy
471 # calculate_energy(loop_prefix)
472 else:
473 recurse(num_atoms, left_atoms-1)
474
475
476# main function
477if __name__ == '__main__':
478 prefix = "ChemicalGraph"
479 if len(sys.argv) <= 3 :
480 print("Usage: "+sys.argv[0]+" <number of final non-hydrogen atoms> <bond_table> [prefix]")
481 sys.exit(1)
482
483 num_atoms = int(sys.argv[1])
484 bond_table = sys.argv[2]
485 filename = sys.argv[3] if len(sys.argv) > 3 else ""
486 prefix = prepare_run(bond_table = bond_table, filename = filename)
487
488 callCounter("CommandVerbose", "1")
489
490 # parsing homologies works as deserialization, i.e. it does not append
491 # hence, we parse the current state here and later update it
492 # parse_containers()
493
494 callCounter("SelectionAllAtoms")
495 callCounter("AtomRemove")
496 callCounter("SelectionAllMolecules")
497 callCounter("MoleculeRemove")
498 # this may fail. Hence, wait here
499 callCounter("wait")
500
501 # create the test folder and step into
502 with FolderScope(prefix):
503
504 # loop init
505 non_hydrogen_elements = list(filter(lambda x: x != "H", valencies.keys()))
506 for initial_element in non_hydrogen_elements:
507 print("Using "+initial_element+" as initial element")
508 add_seed_atom(initial_element)
509 saturate_all_atoms()
510
511 # save current state and calculate energy if N=1
512 if num_atoms==1:
513 unique_filename = ensure_unique_filename(prefix+'-1_-_' + get_unique_filename(num_atoms), ".data")
514 callCounter("WorldOutputAs", unique_filename)
515 callCounter("wait")
516
517 # calculate energy
518 # calculate_energy(prefix+'-1')
519
520 # recursion
521 recurse(num_atoms, num_atoms-1)
522
523 # tabula rasa for next seed atom
524 callCounter("SelectionAllAtoms")
525 callCounter("AtomRemove")
526
527 # exit
528 #callCounter("WorldOutput")
529 callCounter("wait")
530 #store_containers()
531
532 print("finished, "+str(counter_all)+" actions called.")