PageRenderTime 51ms CodeModel.GetById 21ms app.highlight 25ms RepoModel.GetById 1ms app.codeStats 0ms

/ase/data/tmxr200x.py

https://bitbucket.org/alexei-matveev/ase-local
Python | 256 lines | 218 code | 16 blank | 22 comment | 30 complexity | 5ddde2b39339af8a11bd81693e80a624 MD5 | raw file
  1import os
  2import pprint
  3import re
  4from urllib import urlretrieve
  5import zipfile
  6
  7import datetime
  8
  9import numpy as np
 10
 11from ase.utils import popen3
 12
 13import ase.io
 14from ase.atom import Atom
 15from ase.atoms import Atoms
 16from ase.data import atomic_numbers, chemical_symbols
 17from ase.data import ground_state_magnetic_moments
 18
 19# Transition Metals First-row (TM1R): 10.1021/ct6001187 # 32 compounds
 20# Transition Metals Second-row (TM2R): 10.1021/ct700178y # 19 compounds
 21# Transition Metals Third-row (TM3R): 10.1021/ct800172j # 25 compounds
 22
 23#http://pubs.acs.org/doi/suppl/10.1021/ct6001187/suppl_file/ct6001187-file002.pdf
 24#http://pubs.acs.org/doi/suppl/10.1021/ct700178y/suppl_file/ct700178y-file002.pdf
 25#http://pubs.acs.org/doi/suppl/10.1021/ct800172j/suppl_file/ct800172j_si_001.pdf
 26
 27url_root = 'http://pubs.acs.org/doi/suppl/'
 28journal = '10.1021'
 29database_files = {
 30    'TM1R2006': {'doi': journal + '/ct6001187', 'module': 'TMXR200X_TM1R2006'},
 31    'TM2R2007': {'doi': journal + '/ct700178y', 'module': 'TMXR200X_TM2R2007'},
 32    'TM3R2008': {'doi': journal + '/ct800172j', 'module': 'TMXR200X_TM3R2008'},
 33    }
 34
 35database_files['TM1R2006']['pdf'] = database_files['TM1R2006']['doi'] + '/suppl_file/ct6001187-file002.pdf'
 36
 37database_files['TM2R2007']['pdf'] = database_files['TM2R2007']['doi'] + '/suppl_file/ct700178y-file002.pdf'
 38
 39database_files['TM3R2008']['pdf'] = database_files['TM3R2008']['doi'] + '/suppl_file/ct800172j_si_001.pdf'
 40
 41def download_file(url, filename, dir='.'):
 42    # do not mirror subdirectory structure of url
 43    outfile = os.path.join(dir, os.path.basename(filename))
 44    if 0: # fails, use files from disk
 45        urlretrieve(os.path.join(url, filename), outfile)
 46    return outfile
 47
 48def read_geometries(filename, dir='.'):
 49    txt = os.path.join(dir, filename)
 50    fh = open(txt, 'rb')
 51    table = fh.read()
 52    firstsplit = '(in xyz format):' # TM1R2006 and TM2R2007
 53    dataformat = 'xyz'
 54    if table.find('(Gaussian archive entries):') != -1:
 55        firstsplit = '(Gaussian archive entries):' # TM3R2008
 56        dataformat = 'gaussian'
 57    table = table.split(firstsplit)
 58    table = table[1]
 59    # remove one or two digit numbers (page numbers/numbers of atoms in xyz format)
 60    table = re.sub('\n\d\d\n', '\n', table)
 61    table = re.sub('\n\d\n', '\n', table)
 62    # remove S + two digit numbers (page numbers)
 63    table = re.sub('\nS\d\d\n', '\n', table)
 64    # remove S + one digit (page numbers)
 65    table = re.sub('\nS\d\n', '\n', table)
 66    # remove empty lines
 67    # http://stackoverflow.com/questions/1140958/whats-a-quick-one-liner-to-remove-empty-lines-from-a-python-string
 68    table = os.linesep.join([s for s in table.splitlines() if s])
 69    geometries = []
 70    if dataformat == 'xyz':
 71        # split on new lines
 72        table = table.split('\n')
 73        # mark compound names with ':' tags
 74        for n, line in enumerate(table):
 75            if not (line.find('.') != -1):
 76                # remove method/basis set information
 77                table[n] = table[n].replace(' BP86/qzvp', '')
 78                table[n] = ':' + table[n] + ':'
 79        table = '\n'.join([s for s in table])
 80        # split into compounds
 81        # http://simonwillison.net/2003/Oct/26/reSplit/
 82        # http://stackoverflow.com/questions/647655/python-regex-split-and-special-character
 83        table = re.compile('(:.*:)').split(table)
 84        # remove empty elements
 85        table = [l.strip() for l in table]
 86        table = [l for l in table if len(l) > 1]
 87        # extract compounds
 88        for n in range(0, len(table), 2):
 89            compound = table[n].replace(':', '').replace(' ', '_')
 90            geometry = []
 91            for atom in table[n+1].split('\n'):
 92                geometry.append(Atom(symbol=atom.split()[0], position=atom.split()[1:]))
 93            atoms = Atoms(geometry)
 94            # set the charge and magnetic moment on the heaviest atom (better ideas?)
 95            heaviest = max([a.get_atomic_number() for a in atoms])
 96            heaviest_index = [a.get_atomic_number() for a in atoms].index(heaviest)
 97            charge = 0.0
 98            if abs(charge) > 0.0:
 99                charges = [0.0 for a in atoms]
100                charges[heaviest_index] = charge
101                atoms.set_initial_charges(charges)
102            if compound in [ # see corresponding articles
103                'Ti(BH4)3',  # TM1R2006
104                'V(NMe2)4',  # TM1R2006
105                'Cu(acac)2',  # TM1R2006
106                'Nb(Cp)(C7H7)_Cs', # TM2R2007
107                'CdMe_C3v', # TM2R2007
108                ]:
109                multiplicity = 2.0
110            else:
111                multiplicity = 1.0
112            if multiplicity > 1.0:
113                magmoms = [0.0 for a in atoms]
114                magmoms[heaviest_index] = multiplicity - 1
115                atoms.set_initial_magnetic_moments(magmoms)
116            geometries.append((compound, atoms))
117    elif dataformat == 'gaussian':
118        # remove new lines
119        table = table.replace('\n', '')
120        # fix: MeHg(Cl) written as MeHg(CN)
121        table = table.replace(
122            'MeHg(CN), qzvp (SDD/def-qzvp for metal)\\\\0,1\\Hg,0.,0.,0.1975732257',
123            'MeHg(Cl), qzvp (SDD/def-qzvp for metal)\\\\0,1\\Hg,0.,0.,0.1975732257')
124        # split on compound end marks
125        table = table.split('\\\@')
126        # remove empty elements
127        table = [l.strip() for l in table]
128        table = [l for l in table if len(l) > 1]
129        # extract compounds
130        for n, line in enumerate(table):
131            # split on gaussian separator '\\'
132            entries = line.split('\\\\')
133            compound = entries[2].split(',')[0].split(' ')[0]
134            # charge and multiplicity from gaussian archive
135            charge, multiplicity = entries[3].split('\\')[0].split(',')
136            charge = float(charge)
137            multiplicity = float(multiplicity)
138            if compound in ['Au(Me)PMe3']: # in gzmat format!
139                # check openbabel version (babel >= 2.2 needed)
140                cmd = popen3('babel -V')[1]
141                output = cmd.read().strip()
142                cmd.close()
143                v1, v2, v3 = output.split()[2].split('.')
144                v1, v2, v3 = int(v1), int(v2), int(v3)
145                if not (v1 > 2 or ((v1 == 2) and (v2 >= 2))):
146                    print compound + ': skipped - version of babel does not support gzmat format'
147                    continue # this one is given in z-matrix format
148                finame = compound.replace('(', '').replace(')', '') + '.orig'
149                foname = finame.split('.')[0] + '.xyz'
150                fi = open(finame, 'w')
151                fo = open(foname, 'w')
152                if 1: # how to extract zmat by hand
153                    zmat = ['#'] # must start with gaussian input start
154                    zmat.extend('@') # separated by newline
155                    zmat.extend([compound])
156                    zmat.extend('@') # separated by newline
157                    zmat.extend([str(int(charge)) + ' ' + str(int(multiplicity))])
158                    zmat.extend(entries[3].replace(',', ' ').split('\\')[1:])
159                    zmat.extend('@') # atom and variable definitions separated by newline
160                    zmat.extend(entries[4].split('\\'))
161                    zmat.extend('@') # end with newline
162                    for l in zmat:
163                        fi.write(l.replace('@', '').replace('=', ' ') + '\n')
164                    fi.close()
165                if 0:
166                    # or use the whole gausian archive entry
167                    entries = ''.join(entries)
168                    fi.write(entries)
169                # convert gzmat into xyz using openbabel (babel >= 2.2 needed)
170                cmd = popen3('babel -i gzmat ' + finame + ' -o xyz ' + foname)[2]
171                error = cmd.read().strip()
172                cmd.close()
173                fo.close()
174                if not (error.find('0 molecules') != -1):
175                    atoms = ase.io.read(foname)
176                else:
177                    print compound + ': babel conversion failed'
178                    continue # conversion failed
179            else:
180                positions = entries[3].replace(',', ' ').split('\\')[1:]
181                geometry = []
182                for k, atom in enumerate(positions):
183                    geometry.append(Atom(symbol=atom.split()[0],
184                                         position=[float(p) for p in atom.split()[1:]]))
185                atoms = Atoms(geometry)
186            #
187            # set the charge and magnetic moment on the heaviest atom (better ideas?)
188            heaviest = max([a.get_atomic_number() for a in atoms])
189            heaviest_index = [a.get_atomic_number() for a in atoms].index(heaviest)
190            if abs(charge) > 0.0:
191                charges = [0.0 for a in atoms]
192                charges[heaviest_index] = charge
193                atoms.set_initial_charges(charges)
194            if multiplicity > 1.0:
195                magmoms = [0.0 for a in atoms]
196                magmoms[heaviest_index] = multiplicity - 1
197                atoms.set_initial_magnetic_moments(magmoms)
198            geometries.append((compound, atoms))
199    return geometries
200
201def pdftotext(filename):
202    os.system('pdftotext -raw -nopgbrk '+ filename)
203    return os.path.splitext(filename)[0] + '.txt'
204
205from ase.data.gmtkn30 import format_data
206
207def main():
208    if not os.path.isdir('TMXR200X'):
209        os.makedirs('TMXR200X')
210    #for database in ['TM1R2006']:
211    for database in database_files.keys():
212        fh = open(database_files[database]['module'].lower() + '.py', 'w')
213        fh.write('# Computer generated code! Hands off!\n')
214        fh.write('# Generated: ' + str(datetime.date.today()) + '\n')
215        fh.write('from numpy import array\n')
216        fh.write('data = ')
217        data = {} # specification of molecules
218        info = {} # reference/calculation info
219        # download structures
220        file = database_files[database]['pdf']
221        f = os.path.abspath(download_file(url_root, file, dir='TMXR200X'))
222        f = pdftotext(f)
223        geometries = read_geometries(f)
224        # set number of unpaired electrons and charges
225        no_unpaired_electrons = []
226        charges = []
227        for a in geometries:
228            magmom = sum(a[1].get_initial_magnetic_moments())
229            if magmom > 0.0:
230                no_unpaired_electrons.append((a[0], magmom))
231            charge = sum(a[1].get_charges())
232            if abs(charge) > 0.0:
233                charges.append((a[0], charge))
234        data = format_data(database, geometries, no_unpaired_electrons, charges)
235        # all constituent atoms
236        atoms = []
237        for formula, geometry in geometries:
238            atoms.extend(list(set(geometry.get_chemical_symbols())))
239        atoms=list(set(atoms))
240        atoms.sort()
241        for atom in atoms:
242            magmom=ground_state_magnetic_moments[atomic_numbers[atom]]
243            data[atom] = {
244                'database': database,
245                'name': atom,
246                'symbols': atom,
247                'magmoms': [magmom], # None or list
248                'charges': None, # None or list
249                'positions': np.array([[0.0]*3]),
250                }
251            Atom(atom, magmom=magmom)
252        pprint.pprint(data, stream=fh)
253        fh.close()
254
255if __name__ == '__main__':
256    main()