/ase/data/tmxr200x.py
Python | 256 lines | 218 code | 16 blank | 22 comment | 30 complexity | 5ddde2b39339af8a11bd81693e80a624 MD5 | raw file
1import os
2import pprint
3import re
4from urllib import urlretrieve
5import zipfile
6
7import datetime
8
9import numpy as np
10
11from ase.utils import popen3
12
13import ase.io
14from ase.atom import Atom
15from ase.atoms import Atoms
16from ase.data import atomic_numbers, chemical_symbols
17from ase.data import ground_state_magnetic_moments
18
19# Transition Metals First-row (TM1R): 10.1021/ct6001187 # 32 compounds
20# Transition Metals Second-row (TM2R): 10.1021/ct700178y # 19 compounds
21# Transition Metals Third-row (TM3R): 10.1021/ct800172j # 25 compounds
22
23#http://pubs.acs.org/doi/suppl/10.1021/ct6001187/suppl_file/ct6001187-file002.pdf
24#http://pubs.acs.org/doi/suppl/10.1021/ct700178y/suppl_file/ct700178y-file002.pdf
25#http://pubs.acs.org/doi/suppl/10.1021/ct800172j/suppl_file/ct800172j_si_001.pdf
26
27url_root = 'http://pubs.acs.org/doi/suppl/'
28journal = '10.1021'
29database_files = {
30 'TM1R2006': {'doi': journal + '/ct6001187', 'module': 'TMXR200X_TM1R2006'},
31 'TM2R2007': {'doi': journal + '/ct700178y', 'module': 'TMXR200X_TM2R2007'},
32 'TM3R2008': {'doi': journal + '/ct800172j', 'module': 'TMXR200X_TM3R2008'},
33 }
34
35database_files['TM1R2006']['pdf'] = database_files['TM1R2006']['doi'] + '/suppl_file/ct6001187-file002.pdf'
36
37database_files['TM2R2007']['pdf'] = database_files['TM2R2007']['doi'] + '/suppl_file/ct700178y-file002.pdf'
38
39database_files['TM3R2008']['pdf'] = database_files['TM3R2008']['doi'] + '/suppl_file/ct800172j_si_001.pdf'
40
41def download_file(url, filename, dir='.'):
42 # do not mirror subdirectory structure of url
43 outfile = os.path.join(dir, os.path.basename(filename))
44 if 0: # fails, use files from disk
45 urlretrieve(os.path.join(url, filename), outfile)
46 return outfile
47
48def read_geometries(filename, dir='.'):
49 txt = os.path.join(dir, filename)
50 fh = open(txt, 'rb')
51 table = fh.read()
52 firstsplit = '(in xyz format):' # TM1R2006 and TM2R2007
53 dataformat = 'xyz'
54 if table.find('(Gaussian archive entries):') != -1:
55 firstsplit = '(Gaussian archive entries):' # TM3R2008
56 dataformat = 'gaussian'
57 table = table.split(firstsplit)
58 table = table[1]
59 # remove one or two digit numbers (page numbers/numbers of atoms in xyz format)
60 table = re.sub('\n\d\d\n', '\n', table)
61 table = re.sub('\n\d\n', '\n', table)
62 # remove S + two digit numbers (page numbers)
63 table = re.sub('\nS\d\d\n', '\n', table)
64 # remove S + one digit (page numbers)
65 table = re.sub('\nS\d\n', '\n', table)
66 # remove empty lines
67 # http://stackoverflow.com/questions/1140958/whats-a-quick-one-liner-to-remove-empty-lines-from-a-python-string
68 table = os.linesep.join([s for s in table.splitlines() if s])
69 geometries = []
70 if dataformat == 'xyz':
71 # split on new lines
72 table = table.split('\n')
73 # mark compound names with ':' tags
74 for n, line in enumerate(table):
75 if not (line.find('.') != -1):
76 # remove method/basis set information
77 table[n] = table[n].replace(' BP86/qzvp', '')
78 table[n] = ':' + table[n] + ':'
79 table = '\n'.join([s for s in table])
80 # split into compounds
81 # http://simonwillison.net/2003/Oct/26/reSplit/
82 # http://stackoverflow.com/questions/647655/python-regex-split-and-special-character
83 table = re.compile('(:.*:)').split(table)
84 # remove empty elements
85 table = [l.strip() for l in table]
86 table = [l for l in table if len(l) > 1]
87 # extract compounds
88 for n in range(0, len(table), 2):
89 compound = table[n].replace(':', '').replace(' ', '_')
90 geometry = []
91 for atom in table[n+1].split('\n'):
92 geometry.append(Atom(symbol=atom.split()[0], position=atom.split()[1:]))
93 atoms = Atoms(geometry)
94 # set the charge and magnetic moment on the heaviest atom (better ideas?)
95 heaviest = max([a.get_atomic_number() for a in atoms])
96 heaviest_index = [a.get_atomic_number() for a in atoms].index(heaviest)
97 charge = 0.0
98 if abs(charge) > 0.0:
99 charges = [0.0 for a in atoms]
100 charges[heaviest_index] = charge
101 atoms.set_initial_charges(charges)
102 if compound in [ # see corresponding articles
103 'Ti(BH4)3', # TM1R2006
104 'V(NMe2)4', # TM1R2006
105 'Cu(acac)2', # TM1R2006
106 'Nb(Cp)(C7H7)_Cs', # TM2R2007
107 'CdMe_C3v', # TM2R2007
108 ]:
109 multiplicity = 2.0
110 else:
111 multiplicity = 1.0
112 if multiplicity > 1.0:
113 magmoms = [0.0 for a in atoms]
114 magmoms[heaviest_index] = multiplicity - 1
115 atoms.set_initial_magnetic_moments(magmoms)
116 geometries.append((compound, atoms))
117 elif dataformat == 'gaussian':
118 # remove new lines
119 table = table.replace('\n', '')
120 # fix: MeHg(Cl) written as MeHg(CN)
121 table = table.replace(
122 'MeHg(CN), qzvp (SDD/def-qzvp for metal)\\\\0,1\\Hg,0.,0.,0.1975732257',
123 'MeHg(Cl), qzvp (SDD/def-qzvp for metal)\\\\0,1\\Hg,0.,0.,0.1975732257')
124 # split on compound end marks
125 table = table.split('\\\@')
126 # remove empty elements
127 table = [l.strip() for l in table]
128 table = [l for l in table if len(l) > 1]
129 # extract compounds
130 for n, line in enumerate(table):
131 # split on gaussian separator '\\'
132 entries = line.split('\\\\')
133 compound = entries[2].split(',')[0].split(' ')[0]
134 # charge and multiplicity from gaussian archive
135 charge, multiplicity = entries[3].split('\\')[0].split(',')
136 charge = float(charge)
137 multiplicity = float(multiplicity)
138 if compound in ['Au(Me)PMe3']: # in gzmat format!
139 # check openbabel version (babel >= 2.2 needed)
140 cmd = popen3('babel -V')[1]
141 output = cmd.read().strip()
142 cmd.close()
143 v1, v2, v3 = output.split()[2].split('.')
144 v1, v2, v3 = int(v1), int(v2), int(v3)
145 if not (v1 > 2 or ((v1 == 2) and (v2 >= 2))):
146 print compound + ': skipped - version of babel does not support gzmat format'
147 continue # this one is given in z-matrix format
148 finame = compound.replace('(', '').replace(')', '') + '.orig'
149 foname = finame.split('.')[0] + '.xyz'
150 fi = open(finame, 'w')
151 fo = open(foname, 'w')
152 if 1: # how to extract zmat by hand
153 zmat = ['#'] # must start with gaussian input start
154 zmat.extend('@') # separated by newline
155 zmat.extend([compound])
156 zmat.extend('@') # separated by newline
157 zmat.extend([str(int(charge)) + ' ' + str(int(multiplicity))])
158 zmat.extend(entries[3].replace(',', ' ').split('\\')[1:])
159 zmat.extend('@') # atom and variable definitions separated by newline
160 zmat.extend(entries[4].split('\\'))
161 zmat.extend('@') # end with newline
162 for l in zmat:
163 fi.write(l.replace('@', '').replace('=', ' ') + '\n')
164 fi.close()
165 if 0:
166 # or use the whole gausian archive entry
167 entries = ''.join(entries)
168 fi.write(entries)
169 # convert gzmat into xyz using openbabel (babel >= 2.2 needed)
170 cmd = popen3('babel -i gzmat ' + finame + ' -o xyz ' + foname)[2]
171 error = cmd.read().strip()
172 cmd.close()
173 fo.close()
174 if not (error.find('0 molecules') != -1):
175 atoms = ase.io.read(foname)
176 else:
177 print compound + ': babel conversion failed'
178 continue # conversion failed
179 else:
180 positions = entries[3].replace(',', ' ').split('\\')[1:]
181 geometry = []
182 for k, atom in enumerate(positions):
183 geometry.append(Atom(symbol=atom.split()[0],
184 position=[float(p) for p in atom.split()[1:]]))
185 atoms = Atoms(geometry)
186 #
187 # set the charge and magnetic moment on the heaviest atom (better ideas?)
188 heaviest = max([a.get_atomic_number() for a in atoms])
189 heaviest_index = [a.get_atomic_number() for a in atoms].index(heaviest)
190 if abs(charge) > 0.0:
191 charges = [0.0 for a in atoms]
192 charges[heaviest_index] = charge
193 atoms.set_initial_charges(charges)
194 if multiplicity > 1.0:
195 magmoms = [0.0 for a in atoms]
196 magmoms[heaviest_index] = multiplicity - 1
197 atoms.set_initial_magnetic_moments(magmoms)
198 geometries.append((compound, atoms))
199 return geometries
200
201def pdftotext(filename):
202 os.system('pdftotext -raw -nopgbrk '+ filename)
203 return os.path.splitext(filename)[0] + '.txt'
204
205from ase.data.gmtkn30 import format_data
206
207def main():
208 if not os.path.isdir('TMXR200X'):
209 os.makedirs('TMXR200X')
210 #for database in ['TM1R2006']:
211 for database in database_files.keys():
212 fh = open(database_files[database]['module'].lower() + '.py', 'w')
213 fh.write('# Computer generated code! Hands off!\n')
214 fh.write('# Generated: ' + str(datetime.date.today()) + '\n')
215 fh.write('from numpy import array\n')
216 fh.write('data = ')
217 data = {} # specification of molecules
218 info = {} # reference/calculation info
219 # download structures
220 file = database_files[database]['pdf']
221 f = os.path.abspath(download_file(url_root, file, dir='TMXR200X'))
222 f = pdftotext(f)
223 geometries = read_geometries(f)
224 # set number of unpaired electrons and charges
225 no_unpaired_electrons = []
226 charges = []
227 for a in geometries:
228 magmom = sum(a[1].get_initial_magnetic_moments())
229 if magmom > 0.0:
230 no_unpaired_electrons.append((a[0], magmom))
231 charge = sum(a[1].get_charges())
232 if abs(charge) > 0.0:
233 charges.append((a[0], charge))
234 data = format_data(database, geometries, no_unpaired_electrons, charges)
235 # all constituent atoms
236 atoms = []
237 for formula, geometry in geometries:
238 atoms.extend(list(set(geometry.get_chemical_symbols())))
239 atoms=list(set(atoms))
240 atoms.sort()
241 for atom in atoms:
242 magmom=ground_state_magnetic_moments[atomic_numbers[atom]]
243 data[atom] = {
244 'database': database,
245 'name': atom,
246 'symbols': atom,
247 'magmoms': [magmom], # None or list
248 'charges': None, # None or list
249 'positions': np.array([[0.0]*3]),
250 }
251 Atom(atom, magmom=magmom)
252 pprint.pprint(data, stream=fh)
253 fh.close()
254
255if __name__ == '__main__':
256 main()