PageRenderTime 4269ms CodeModel.GetById 30ms RepoModel.GetById 1ms app.codeStats 0ms

/ase/data/tmxr200x.py

https://gitlab.com/jmlorenzi/ase
Python | 254 lines | 216 code | 16 blank | 22 comment | 30 complexity | afdfa241e25a8954c6a06ee55a060979 MD5 | raw file
  1. from __future__ import print_function
  2. import os
  3. import pprint
  4. import re
  5. from urllib.request import urlretrieve
  6. import datetime
  7. import numpy as np
  8. from ase.utils import popen3
  9. import ase.io
  10. from ase.atom import Atom
  11. from ase.atoms import Atoms
  12. from ase.data import atomic_numbers
  13. from ase.data import ground_state_magnetic_moments
  14. # Transition Metals First-row (TM1R): 10.1021/ct6001187 # 32 compounds
  15. # Transition Metals Second-row (TM2R): 10.1021/ct700178y # 19 compounds
  16. # Transition Metals Third-row (TM3R): 10.1021/ct800172j # 25 compounds
  17. #http://pubs.acs.org/doi/suppl/10.1021/ct6001187/suppl_file/ct6001187-file002.pdf
  18. #http://pubs.acs.org/doi/suppl/10.1021/ct700178y/suppl_file/ct700178y-file002.pdf
  19. #http://pubs.acs.org/doi/suppl/10.1021/ct800172j/suppl_file/ct800172j_si_001.pdf
  20. url_root = 'http://pubs.acs.org/doi/suppl/'
  21. journal = '10.1021'
  22. database_files = {
  23. 'TM1R2006': {'doi': journal + '/ct6001187', 'module': 'TMXR200X_TM1R2006'},
  24. 'TM2R2007': {'doi': journal + '/ct700178y', 'module': 'TMXR200X_TM2R2007'},
  25. 'TM3R2008': {'doi': journal + '/ct800172j', 'module': 'TMXR200X_TM3R2008'},
  26. }
  27. database_files['TM1R2006']['pdf'] = database_files['TM1R2006']['doi'] + '/suppl_file/ct6001187-file002.pdf'
  28. database_files['TM2R2007']['pdf'] = database_files['TM2R2007']['doi'] + '/suppl_file/ct700178y-file002.pdf'
  29. database_files['TM3R2008']['pdf'] = database_files['TM3R2008']['doi'] + '/suppl_file/ct800172j_si_001.pdf'
  30. def download_file(url, filename, dir='.'):
  31. # do not mirror subdirectory structure of url
  32. outfile = os.path.join(dir, os.path.basename(filename))
  33. if 0: # fails, use files from disk
  34. urlretrieve(os.path.join(url, filename), outfile)
  35. return outfile
  36. def read_geometries(filename, dir='.'):
  37. txt = os.path.join(dir, filename)
  38. fh = open(txt, 'rb')
  39. table = fh.read()
  40. firstsplit = '(in xyz format):' # TM1R2006 and TM2R2007
  41. dataformat = 'xyz'
  42. if table.find('(Gaussian archive entries):') != -1:
  43. firstsplit = '(Gaussian archive entries):' # TM3R2008
  44. dataformat = 'gaussian'
  45. table = table.split(firstsplit)
  46. table = table[1]
  47. # remove one or two digit numbers (page numbers/numbers of atoms in xyz format)
  48. table = re.sub('\n\d\d\n', '\n', table)
  49. table = re.sub('\n\d\n', '\n', table)
  50. # remove S + two digit numbers (page numbers)
  51. table = re.sub('\nS\d\d\n', '\n', table)
  52. # remove S + one digit (page numbers)
  53. table = re.sub('\nS\d\n', '\n', table)
  54. # remove empty lines
  55. # http://stackoverflow.com/questions/1140958/whats-a-quick-one-liner-to-remove-empty-lines-from-a-python-string
  56. table = os.linesep.join([s for s in table.splitlines() if s])
  57. geometries = []
  58. if dataformat == 'xyz':
  59. # split on new lines
  60. table = table.split('\n')
  61. # mark compound names with ':' tags
  62. for n, line in enumerate(table):
  63. if not (line.find('.') != -1):
  64. # remove method/basis set information
  65. table[n] = table[n].replace(' BP86/qzvp', '')
  66. table[n] = ':' + table[n] + ':'
  67. table = '\n'.join([s for s in table])
  68. # split into compounds
  69. # http://simonwillison.net/2003/Oct/26/reSplit/
  70. # http://stackoverflow.com/questions/647655/python-regex-split-and-special-character
  71. table = re.compile('(:.*:)').split(table)
  72. # remove empty elements
  73. table = [l.strip() for l in table]
  74. table = [l for l in table if len(l) > 1]
  75. # extract compounds
  76. for n in range(0, len(table), 2):
  77. compound = table[n].replace(':', '').replace(' ', '_')
  78. geometry = []
  79. for atom in table[n+1].split('\n'):
  80. geometry.append(Atom(symbol=atom.split()[0], position=atom.split()[1:]))
  81. atoms = Atoms(geometry)
  82. # set the charge and magnetic moment on the heaviest atom (better ideas?)
  83. heaviest = max([a.get_atomic_number() for a in atoms])
  84. heaviest_index = [a.get_atomic_number() for a in atoms].index(heaviest)
  85. charge = 0.0
  86. if abs(charge) > 0.0:
  87. charges = [0.0 for a in atoms]
  88. charges[heaviest_index] = charge
  89. atoms.set_initial_charges(charges)
  90. if compound in [ # see corresponding articles
  91. 'Ti(BH4)3', # TM1R2006
  92. 'V(NMe2)4', # TM1R2006
  93. 'Cu(acac)2', # TM1R2006
  94. 'Nb(Cp)(C7H7)_Cs', # TM2R2007
  95. 'CdMe_C3v', # TM2R2007
  96. ]:
  97. multiplicity = 2.0
  98. else:
  99. multiplicity = 1.0
  100. if multiplicity > 1.0:
  101. magmoms = [0.0 for a in atoms]
  102. magmoms[heaviest_index] = multiplicity - 1
  103. atoms.set_initial_magnetic_moments(magmoms)
  104. geometries.append((compound, atoms))
  105. elif dataformat == 'gaussian':
  106. # remove new lines
  107. table = table.replace('\n', '')
  108. # fix: MeHg(Cl) written as MeHg(CN)
  109. table = table.replace(
  110. 'MeHg(CN), qzvp (SDD/def-qzvp for metal)\\\\0,1\\Hg,0.,0.,0.1975732257',
  111. 'MeHg(Cl), qzvp (SDD/def-qzvp for metal)\\\\0,1\\Hg,0.,0.,0.1975732257')
  112. # split on compound end marks
  113. table = table.split('\\\@')
  114. # remove empty elements
  115. table = [l.strip() for l in table]
  116. table = [l for l in table if len(l) > 1]
  117. # extract compounds
  118. for n, line in enumerate(table):
  119. # split on gaussian separator '\\'
  120. entries = line.split('\\\\')
  121. compound = entries[2].split(',')[0].split(' ')[0]
  122. # charge and multiplicity from gaussian archive
  123. charge, multiplicity = entries[3].split('\\')[0].split(',')
  124. charge = float(charge)
  125. multiplicity = float(multiplicity)
  126. if compound in ['Au(Me)PMe3']: # in gzmat format!
  127. # check openbabel version (babel >= 2.2 needed)
  128. cmd = popen3('babel -V')[1]
  129. output = cmd.read().strip()
  130. cmd.close()
  131. v1, v2, v3 = output.split()[2].split('.')
  132. v1, v2, v3 = int(v1), int(v2), int(v3)
  133. if not (v1 > 2 or ((v1 == 2) and (v2 >= 2))):
  134. print(compound + ': skipped - version of babel does not support gzmat format')
  135. continue # this one is given in z-matrix format
  136. finame = compound.replace('(', '').replace(')', '') + '.orig'
  137. foname = finame.split('.')[0] + '.xyz'
  138. fi = open(finame, 'w')
  139. fo = open(foname, 'w')
  140. if 1: # how to extract zmat by hand
  141. zmat = ['#'] # must start with gaussian input start
  142. zmat.extend('@') # separated by newline
  143. zmat.extend([compound])
  144. zmat.extend('@') # separated by newline
  145. zmat.extend([str(int(charge)) + ' ' + str(int(multiplicity))])
  146. zmat.extend(entries[3].replace(',', ' ').split('\\')[1:])
  147. zmat.extend('@') # atom and variable definitions separated by newline
  148. zmat.extend(entries[4].split('\\'))
  149. zmat.extend('@') # end with newline
  150. for l in zmat:
  151. fi.write(l.replace('@', '').replace('=', ' ') + '\n')
  152. fi.close()
  153. if 0:
  154. # or use the whole gausian archive entry
  155. entries = ''.join(entries)
  156. fi.write(entries)
  157. # convert gzmat into xyz using openbabel (babel >= 2.2 needed)
  158. cmd = popen3('babel -i gzmat ' + finame + ' -o xyz ' + foname)[2]
  159. error = cmd.read().strip()
  160. cmd.close()
  161. fo.close()
  162. if not (error.find('0 molecules') != -1):
  163. atoms = ase.io.read(foname)
  164. else:
  165. print(compound + ': babel conversion failed')
  166. continue # conversion failed
  167. else:
  168. positions = entries[3].replace(',', ' ').split('\\')[1:]
  169. geometry = []
  170. for k, atom in enumerate(positions):
  171. geometry.append(Atom(symbol=atom.split()[0],
  172. position=[float(p) for p in atom.split()[1:]]))
  173. atoms = Atoms(geometry)
  174. #
  175. # set the charge and magnetic moment on the heaviest atom (better ideas?)
  176. heaviest = max([a.get_atomic_number() for a in atoms])
  177. heaviest_index = [a.get_atomic_number() for a in atoms].index(heaviest)
  178. if abs(charge) > 0.0:
  179. charges = [0.0 for a in atoms]
  180. charges[heaviest_index] = charge
  181. atoms.set_initial_charges(charges)
  182. if multiplicity > 1.0:
  183. magmoms = [0.0 for a in atoms]
  184. magmoms[heaviest_index] = multiplicity - 1
  185. atoms.set_initial_magnetic_moments(magmoms)
  186. geometries.append((compound, atoms))
  187. return geometries
  188. def pdftotext(filename):
  189. os.system('pdftotext -raw -nopgbrk '+ filename)
  190. return os.path.splitext(filename)[0] + '.txt'
  191. from ase.data.gmtkn30 import format_data
  192. def main():
  193. if not os.path.isdir('TMXR200X'):
  194. os.makedirs('TMXR200X')
  195. #for database in ['TM1R2006']:
  196. for database in database_files.keys():
  197. fh = open(database_files[database]['module'].lower() + '.py', 'w')
  198. fh.write('# Computer generated code! Hands off!\n')
  199. fh.write('# Generated: ' + str(datetime.date.today()) + '\n')
  200. fh.write('from numpy import array\n')
  201. fh.write('data = ')
  202. data = {} # specification of molecules
  203. # download structures
  204. file = database_files[database]['pdf']
  205. f = os.path.abspath(download_file(url_root, file, dir='TMXR200X'))
  206. f = pdftotext(f)
  207. geometries = read_geometries(f)
  208. # set number of unpaired electrons and charges
  209. no_unpaired_electrons = []
  210. charges = []
  211. for a in geometries:
  212. magmom = sum(a[1].get_initial_magnetic_moments())
  213. if magmom > 0.0:
  214. no_unpaired_electrons.append((a[0], magmom))
  215. charge = sum(a[1].get_charges())
  216. if abs(charge) > 0.0:
  217. charges.append((a[0], charge))
  218. data = format_data(database, geometries, no_unpaired_electrons, charges)
  219. # all constituent atoms
  220. atoms = []
  221. for formula, geometry in geometries:
  222. atoms.extend(list(set(geometry.get_chemical_symbols())))
  223. atoms=sorted(set(atoms))
  224. for atom in atoms:
  225. magmom=ground_state_magnetic_moments[atomic_numbers[atom]]
  226. data[atom] = {
  227. 'database': database,
  228. 'name': atom,
  229. 'symbols': atom,
  230. 'magmoms': [magmom], # None or list
  231. 'charges': None, # None or list
  232. 'positions': np.array([[0.0]*3]),
  233. }
  234. Atom(atom, magmom=magmom)
  235. pprint.pprint(data, stream=fh)
  236. fh.close()
  237. if __name__ == '__main__':
  238. main()