PageRenderTime 81ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 1ms

/ase/data/tmxr200x.py

https://bitbucket.org/alexei-matveev/ase-local
Python | 256 lines | 218 code | 16 blank | 22 comment | 30 complexity | 5ddde2b39339af8a11bd81693e80a624 MD5 | raw file
  1. import os
  2. import pprint
  3. import re
  4. from urllib import urlretrieve
  5. import zipfile
  6. import datetime
  7. import numpy as np
  8. from ase.utils import popen3
  9. import ase.io
  10. from ase.atom import Atom
  11. from ase.atoms import Atoms
  12. from ase.data import atomic_numbers, chemical_symbols
  13. from ase.data import ground_state_magnetic_moments
  14. # Transition Metals First-row (TM1R): 10.1021/ct6001187 # 32 compounds
  15. # Transition Metals Second-row (TM2R): 10.1021/ct700178y # 19 compounds
  16. # Transition Metals Third-row (TM3R): 10.1021/ct800172j # 25 compounds
  17. #http://pubs.acs.org/doi/suppl/10.1021/ct6001187/suppl_file/ct6001187-file002.pdf
  18. #http://pubs.acs.org/doi/suppl/10.1021/ct700178y/suppl_file/ct700178y-file002.pdf
  19. #http://pubs.acs.org/doi/suppl/10.1021/ct800172j/suppl_file/ct800172j_si_001.pdf
  20. url_root = 'http://pubs.acs.org/doi/suppl/'
  21. journal = '10.1021'
  22. database_files = {
  23. 'TM1R2006': {'doi': journal + '/ct6001187', 'module': 'TMXR200X_TM1R2006'},
  24. 'TM2R2007': {'doi': journal + '/ct700178y', 'module': 'TMXR200X_TM2R2007'},
  25. 'TM3R2008': {'doi': journal + '/ct800172j', 'module': 'TMXR200X_TM3R2008'},
  26. }
  27. database_files['TM1R2006']['pdf'] = database_files['TM1R2006']['doi'] + '/suppl_file/ct6001187-file002.pdf'
  28. database_files['TM2R2007']['pdf'] = database_files['TM2R2007']['doi'] + '/suppl_file/ct700178y-file002.pdf'
  29. database_files['TM3R2008']['pdf'] = database_files['TM3R2008']['doi'] + '/suppl_file/ct800172j_si_001.pdf'
  30. def download_file(url, filename, dir='.'):
  31. # do not mirror subdirectory structure of url
  32. outfile = os.path.join(dir, os.path.basename(filename))
  33. if 0: # fails, use files from disk
  34. urlretrieve(os.path.join(url, filename), outfile)
  35. return outfile
  36. def read_geometries(filename, dir='.'):
  37. txt = os.path.join(dir, filename)
  38. fh = open(txt, 'rb')
  39. table = fh.read()
  40. firstsplit = '(in xyz format):' # TM1R2006 and TM2R2007
  41. dataformat = 'xyz'
  42. if table.find('(Gaussian archive entries):') != -1:
  43. firstsplit = '(Gaussian archive entries):' # TM3R2008
  44. dataformat = 'gaussian'
  45. table = table.split(firstsplit)
  46. table = table[1]
  47. # remove one or two digit numbers (page numbers/numbers of atoms in xyz format)
  48. table = re.sub('\n\d\d\n', '\n', table)
  49. table = re.sub('\n\d\n', '\n', table)
  50. # remove S + two digit numbers (page numbers)
  51. table = re.sub('\nS\d\d\n', '\n', table)
  52. # remove S + one digit (page numbers)
  53. table = re.sub('\nS\d\n', '\n', table)
  54. # remove empty lines
  55. # http://stackoverflow.com/questions/1140958/whats-a-quick-one-liner-to-remove-empty-lines-from-a-python-string
  56. table = os.linesep.join([s for s in table.splitlines() if s])
  57. geometries = []
  58. if dataformat == 'xyz':
  59. # split on new lines
  60. table = table.split('\n')
  61. # mark compound names with ':' tags
  62. for n, line in enumerate(table):
  63. if not (line.find('.') != -1):
  64. # remove method/basis set information
  65. table[n] = table[n].replace(' BP86/qzvp', '')
  66. table[n] = ':' + table[n] + ':'
  67. table = '\n'.join([s for s in table])
  68. # split into compounds
  69. # http://simonwillison.net/2003/Oct/26/reSplit/
  70. # http://stackoverflow.com/questions/647655/python-regex-split-and-special-character
  71. table = re.compile('(:.*:)').split(table)
  72. # remove empty elements
  73. table = [l.strip() for l in table]
  74. table = [l for l in table if len(l) > 1]
  75. # extract compounds
  76. for n in range(0, len(table), 2):
  77. compound = table[n].replace(':', '').replace(' ', '_')
  78. geometry = []
  79. for atom in table[n+1].split('\n'):
  80. geometry.append(Atom(symbol=atom.split()[0], position=atom.split()[1:]))
  81. atoms = Atoms(geometry)
  82. # set the charge and magnetic moment on the heaviest atom (better ideas?)
  83. heaviest = max([a.get_atomic_number() for a in atoms])
  84. heaviest_index = [a.get_atomic_number() for a in atoms].index(heaviest)
  85. charge = 0.0
  86. if abs(charge) > 0.0:
  87. charges = [0.0 for a in atoms]
  88. charges[heaviest_index] = charge
  89. atoms.set_initial_charges(charges)
  90. if compound in [ # see corresponding articles
  91. 'Ti(BH4)3', # TM1R2006
  92. 'V(NMe2)4', # TM1R2006
  93. 'Cu(acac)2', # TM1R2006
  94. 'Nb(Cp)(C7H7)_Cs', # TM2R2007
  95. 'CdMe_C3v', # TM2R2007
  96. ]:
  97. multiplicity = 2.0
  98. else:
  99. multiplicity = 1.0
  100. if multiplicity > 1.0:
  101. magmoms = [0.0 for a in atoms]
  102. magmoms[heaviest_index] = multiplicity - 1
  103. atoms.set_initial_magnetic_moments(magmoms)
  104. geometries.append((compound, atoms))
  105. elif dataformat == 'gaussian':
  106. # remove new lines
  107. table = table.replace('\n', '')
  108. # fix: MeHg(Cl) written as MeHg(CN)
  109. table = table.replace(
  110. 'MeHg(CN), qzvp (SDD/def-qzvp for metal)\\\\0,1\\Hg,0.,0.,0.1975732257',
  111. 'MeHg(Cl), qzvp (SDD/def-qzvp for metal)\\\\0,1\\Hg,0.,0.,0.1975732257')
  112. # split on compound end marks
  113. table = table.split('\\\@')
  114. # remove empty elements
  115. table = [l.strip() for l in table]
  116. table = [l for l in table if len(l) > 1]
  117. # extract compounds
  118. for n, line in enumerate(table):
  119. # split on gaussian separator '\\'
  120. entries = line.split('\\\\')
  121. compound = entries[2].split(',')[0].split(' ')[0]
  122. # charge and multiplicity from gaussian archive
  123. charge, multiplicity = entries[3].split('\\')[0].split(',')
  124. charge = float(charge)
  125. multiplicity = float(multiplicity)
  126. if compound in ['Au(Me)PMe3']: # in gzmat format!
  127. # check openbabel version (babel >= 2.2 needed)
  128. cmd = popen3('babel -V')[1]
  129. output = cmd.read().strip()
  130. cmd.close()
  131. v1, v2, v3 = output.split()[2].split('.')
  132. v1, v2, v3 = int(v1), int(v2), int(v3)
  133. if not (v1 > 2 or ((v1 == 2) and (v2 >= 2))):
  134. print compound + ': skipped - version of babel does not support gzmat format'
  135. continue # this one is given in z-matrix format
  136. finame = compound.replace('(', '').replace(')', '') + '.orig'
  137. foname = finame.split('.')[0] + '.xyz'
  138. fi = open(finame, 'w')
  139. fo = open(foname, 'w')
  140. if 1: # how to extract zmat by hand
  141. zmat = ['#'] # must start with gaussian input start
  142. zmat.extend('@') # separated by newline
  143. zmat.extend([compound])
  144. zmat.extend('@') # separated by newline
  145. zmat.extend([str(int(charge)) + ' ' + str(int(multiplicity))])
  146. zmat.extend(entries[3].replace(',', ' ').split('\\')[1:])
  147. zmat.extend('@') # atom and variable definitions separated by newline
  148. zmat.extend(entries[4].split('\\'))
  149. zmat.extend('@') # end with newline
  150. for l in zmat:
  151. fi.write(l.replace('@', '').replace('=', ' ') + '\n')
  152. fi.close()
  153. if 0:
  154. # or use the whole gausian archive entry
  155. entries = ''.join(entries)
  156. fi.write(entries)
  157. # convert gzmat into xyz using openbabel (babel >= 2.2 needed)
  158. cmd = popen3('babel -i gzmat ' + finame + ' -o xyz ' + foname)[2]
  159. error = cmd.read().strip()
  160. cmd.close()
  161. fo.close()
  162. if not (error.find('0 molecules') != -1):
  163. atoms = ase.io.read(foname)
  164. else:
  165. print compound + ': babel conversion failed'
  166. continue # conversion failed
  167. else:
  168. positions = entries[3].replace(',', ' ').split('\\')[1:]
  169. geometry = []
  170. for k, atom in enumerate(positions):
  171. geometry.append(Atom(symbol=atom.split()[0],
  172. position=[float(p) for p in atom.split()[1:]]))
  173. atoms = Atoms(geometry)
  174. #
  175. # set the charge and magnetic moment on the heaviest atom (better ideas?)
  176. heaviest = max([a.get_atomic_number() for a in atoms])
  177. heaviest_index = [a.get_atomic_number() for a in atoms].index(heaviest)
  178. if abs(charge) > 0.0:
  179. charges = [0.0 for a in atoms]
  180. charges[heaviest_index] = charge
  181. atoms.set_initial_charges(charges)
  182. if multiplicity > 1.0:
  183. magmoms = [0.0 for a in atoms]
  184. magmoms[heaviest_index] = multiplicity - 1
  185. atoms.set_initial_magnetic_moments(magmoms)
  186. geometries.append((compound, atoms))
  187. return geometries
  188. def pdftotext(filename):
  189. os.system('pdftotext -raw -nopgbrk '+ filename)
  190. return os.path.splitext(filename)[0] + '.txt'
  191. from ase.data.gmtkn30 import format_data
  192. def main():
  193. if not os.path.isdir('TMXR200X'):
  194. os.makedirs('TMXR200X')
  195. #for database in ['TM1R2006']:
  196. for database in database_files.keys():
  197. fh = open(database_files[database]['module'].lower() + '.py', 'w')
  198. fh.write('# Computer generated code! Hands off!\n')
  199. fh.write('# Generated: ' + str(datetime.date.today()) + '\n')
  200. fh.write('from numpy import array\n')
  201. fh.write('data = ')
  202. data = {} # specification of molecules
  203. info = {} # reference/calculation info
  204. # download structures
  205. file = database_files[database]['pdf']
  206. f = os.path.abspath(download_file(url_root, file, dir='TMXR200X'))
  207. f = pdftotext(f)
  208. geometries = read_geometries(f)
  209. # set number of unpaired electrons and charges
  210. no_unpaired_electrons = []
  211. charges = []
  212. for a in geometries:
  213. magmom = sum(a[1].get_initial_magnetic_moments())
  214. if magmom > 0.0:
  215. no_unpaired_electrons.append((a[0], magmom))
  216. charge = sum(a[1].get_charges())
  217. if abs(charge) > 0.0:
  218. charges.append((a[0], charge))
  219. data = format_data(database, geometries, no_unpaired_electrons, charges)
  220. # all constituent atoms
  221. atoms = []
  222. for formula, geometry in geometries:
  223. atoms.extend(list(set(geometry.get_chemical_symbols())))
  224. atoms=list(set(atoms))
  225. atoms.sort()
  226. for atom in atoms:
  227. magmom=ground_state_magnetic_moments[atomic_numbers[atom]]
  228. data[atom] = {
  229. 'database': database,
  230. 'name': atom,
  231. 'symbols': atom,
  232. 'magmoms': [magmom], # None or list
  233. 'charges': None, # None or list
  234. 'positions': np.array([[0.0]*3]),
  235. }
  236. Atom(atom, magmom=magmom)
  237. pprint.pprint(data, stream=fh)
  238. fh.close()
  239. if __name__ == '__main__':
  240. main()