
/src/python/WMCore/Services/DBS/DBSWriterObjects.py

https://github.com/PerilousApricot/WMCore
#!/usr/bin/env python
"""
_DBSWriterObjects_

Functions to instantiate and return DBS Objects and insert them
into DBS if required
"""
import logging

from DBSAPI.dbsApi import DbsApi
from DBSAPI.dbsException import *
from DBSAPI.dbsApiException import *
from DBSAPI.dbsPrimaryDataset import DbsPrimaryDataset
from DBSAPI.dbsAlgorithm import DbsAlgorithm
from DBSAPI.dbsQueryableParameterSet import DbsQueryableParameterSet
from DBSAPI.dbsProcessedDataset import DbsProcessedDataset
from DBSAPI.dbsFile import DbsFile
from DBSAPI.dbsFileBlock import DbsFileBlock
from DBSAPI.dbsStorageElement import DbsStorageElement
from DBSAPI.dbsRun import DbsRun
from DBSAPI.dbsLumiSection import DbsLumiSection


def makeTierList(dataTier):
    """
    _makeTierList_

    Standard tool to split data tiers if they contain - chars
    *** Do not use outside of this module ***
    """
    tierList = dataTier.split("-")
    return tierList

def createPrimaryDataset(datasetInfo, apiRef = None):
    """
    _createPrimaryDataset_

    Create and return a Primary Dataset object.
    If apiRef is not None, it is used to insert the dataset into DBS.
    """
    if datasetInfo.has_key('PrimaryDatasetType'):
        PrimaryDatasetType = datasetInfo['PrimaryDatasetType']
    else:
        PrimaryDatasetType = 'mc'

    logging.debug("Inserting PrimaryDataset %s with Type %s" % (
        datasetInfo["PrimaryDataset"], PrimaryDatasetType))
    primary = DbsPrimaryDataset(Name = datasetInfo["PrimaryDataset"],
                                Type = PrimaryDatasetType)

    if apiRef != None:
        apiRef.insertPrimaryDataset(primary)
    return primary
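
# Illustrative usage sketch, not part of the original WMCore module: the
# dictionary keys are the ones createPrimaryDataset reads; the dataset name
# is a placeholder and apiRef would be an already-configured DbsApi instance.
def _examplePrimaryDatasetUsage(apiRef = None):
    datasetInfo = {"PrimaryDataset": "MinimumBias",
                   "PrimaryDatasetType": "data"}
    # With apiRef == None the object is only built; passing a DbsApi
    # instance also inserts it via insertPrimaryDataset.
    return createPrimaryDataset(datasetInfo, apiRef)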

def createAlgorithm(datasetInfo, configMetadata = None, apiRef = None):
    """
    _createAlgorithm_

    Create an algorithm assuming that datasetInfo is a
    ProdCommon.MCPayloads.DatasetInfo like dictionary
    """
    exeName = datasetInfo['ApplicationName']
    appVersion = datasetInfo['ApplicationVersion']
    appFamily = datasetInfo["ApplicationFamily"]

    #
    # HACK: Problem with large PSets (is this still relevant ?)
    #
    # Repacker jobs have no PSetContent/PSetHash
    #
    psetContent = datasetInfo.get('PSetContent', None)
    if psetContent == None:
        psetContent = "PSET_CONTENT_NOT_AVAILABLE"
    psetHash = datasetInfo.get('PSetHash', None)
    if psetHash == None:
        psetHash = "NO_PSET_HASH"
    else:
        if psetHash.find(";") != -1:
            # no need for fake hash in new schema
            psetHash = psetHash.split(";")[0]
            psetHash = psetHash.replace("hash=", "")

    ## No more hacks
    #msg = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"
    #msg += "TEST HACK USED FOR PSetContent\n"
    #msg += ">>>>>>>>>>>>>>>>>>>>>>>>>>>>"
    #logging.warning(msg)
    #print msg
    #psetContent = "This is not a PSet"
    #
    # HACK: 100 char limit on cfg file name
    if configMetadata != None:
        cfgName = configMetadata['name']
        if len(cfgName) > 100:
            msg = ">>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"
            msg += "TEST HACK USED FOR Config File Name"
            msg += ">>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            logging.warning(msg)
            print msg
            configMetadata['name'] = cfgName[-99:]

        psetInstance = DbsQueryableParameterSet(
            Hash = psetHash,
            Name = configMetadata['name'],
            Version = configMetadata['version'],
            Type = configMetadata['Type'],
            Annotation = configMetadata['annotation'],
            Content = psetContent,
            )
        algorithmInstance = DbsAlgorithm(
            ExecutableName = exeName,
            ApplicationVersion = appVersion,
            ApplicationFamily = appFamily,
            ParameterSetID = psetInstance
            )
    else:
        psetInstance = DbsQueryableParameterSet(
            Hash = psetHash)
        algorithmInstance = DbsAlgorithm(
            ExecutableName = exeName,
            ApplicationVersion = appVersion,
            ApplicationFamily = appFamily,
            ParameterSetID = psetInstance
            )

    if apiRef != None:
        apiRef.insertAlgorithm(algorithmInstance)
    return algorithmInstance
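
# Illustrative sketch, not part of the original module: building a
# DbsAlgorithm with full configuration metadata. All values below are
# placeholders, assuming a DatasetInfo-like dictionary and a configMetadata
# dictionary with the keys read above.
def _exampleCreateAlgorithm(apiRef = None):
    datasetInfo = {"ApplicationName": "cmsRun",
                   "ApplicationVersion": "CMSSW_3_8_4",
                   "ApplicationFamily": "output",
                   # "hash=<md5>;<guid>" style strings are reduced to the bare hash
                   "PSetHash": "hash=d41d8cd98f00b204e9800998ecf8427e;12345",
                   "PSetContent": None}
    configMetadata = {"name": "RecoConfig.py",
                      "version": "V1",
                      "Type": "user",
                      "annotation": "example configuration"}
    return createAlgorithm(datasetInfo, configMetadata, apiRef)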

def createAlgorithmForInsert(datasetInfo):
    """
    _createPartialAlgorithm_

    Create an Algorithm instance that uses the minimal info needed
    to insert a file
    """
    exeName = datasetInfo['ApplicationName']
    appVersion = datasetInfo['ApplicationVersion']
    appFamily = datasetInfo["ApplicationFamily"]

    #
    # Repacker jobs have no PSetContent/PSetHash
    #
    psetContent = datasetInfo.get('PSetContent', None)
    if psetContent == None:
        psetContent = "PSET_CONTENT_NOT_AVAILABLE"
    psetHash = datasetInfo.get('PSetHash', None)
    if psetHash == None:
        psetHash = "NO_PSET_HASH"
    else:
        if psetHash.find(";") != -1:
            # no need for fake hash in new schema
            psetHash = psetHash.split(";")[0]
            psetHash = psetHash.replace("hash=", "")

    psetInstance = DbsQueryableParameterSet(
        Hash = psetHash)
    algorithmInstance = DbsAlgorithm(
        ExecutableName = exeName,
        ApplicationVersion = appVersion,
        ApplicationFamily = appFamily,
        ParameterSetID = psetInstance
        )
    return algorithmInstance

def createMergeAlgorithm(datasetInfo, apiRef = None):
    """
    _createMergeAlgorithm_

    Create a DbsAlgorithm for a merge dataset
    """
    exeName = datasetInfo['ApplicationName']
    version = datasetInfo['ApplicationVersion']
    family = datasetInfo.get('ApplicationFamily', None)
    if (family == None) or not (family):
        family = datasetInfo['OutputModuleName']

    mergeAlgo = DbsAlgorithm(
        ExecutableName = exeName,
        ApplicationVersion = version,
        ApplicationFamily = family,
        )
    if apiRef != None:
        apiRef.insertAlgorithm(mergeAlgo)
    return mergeAlgo
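
# Illustrative sketch, not part of the original module: for merge datasets
# only the application triple is recorded, and when ApplicationFamily is
# missing or empty the OutputModuleName is used instead. Values are
# placeholders.
def _exampleCreateMergeAlgorithm(apiRef = None):
    mergeInfo = {"ApplicationName": "cmsRun",
                 "ApplicationVersion": "CMSSW_3_8_4",
                 "ApplicationFamily": None,
                 "OutputModuleName": "Merged"}
    return createMergeAlgorithm(mergeInfo, apiRef)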

def createProcessedDataset(primaryDataset, algorithm, datasetInfo, apiRef = None):
    """
    _createProcessedDataset_

    Create a DbsProcessedDataset from the primary dataset, algorithm and
    datasetInfo dictionary provided; insert it into DBS if apiRef is given.
    """
    physicsGroup = datasetInfo.get("PhysicsGroup", "NoGroup")
    status = datasetInfo.get("Status", "VALID")
    dataTier = datasetInfo['DataTier']
    globalTag = datasetInfo.get('Conditions', None)
    if globalTag is None:
        globalTag = ''

    parents = []
    inputDataset = datasetInfo.get('ParentDataset', None)
    if inputDataset != None:
        parents.append(inputDataset)

    tierList = makeTierList(datasetInfo['DataTier'])
    name = datasetInfo['ProcessedDataset']

    algolist = []
    if algorithm not in ('', None):
        algolist = list(algorithm)

    processedDataset = DbsProcessedDataset(
        PrimaryDataset = primaryDataset,
        AlgoList = algolist,
        Name = name,
        TierList = tierList,
        ParentList = parents,
        PhysicsGroup = physicsGroup,
        Status = status,
        GlobalTag = globalTag,
        )
    if apiRef != None:
        apiRef.insertProcessedDataset(processedDataset)

    logging.debug(
        "PrimaryDataset: %s ProcessedDataset: %s DataTierList: %s requested by PhysicsGroup: %s" % (
            primaryDataset['Name'], name, tierList, physicsGroup))
    return processedDataset
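
# Illustrative sketch, not part of the original module: a processed dataset
# ties together the primary dataset, the algorithm and the tier/processing
# metadata. The dataset names, tier and GlobalTag below are placeholders; a
# hyphenated DataTier is split into a TierList by makeTierList.
def _exampleCreateProcessedDataset(apiRef = None):
    datasetInfo = {"PrimaryDataset": "MinimumBias",
                   "ProcessedDataset": "Commissioning10-v4",
                   "DataTier": "RECO-AOD",
                   "ApplicationName": "cmsRun",
                   "ApplicationVersion": "CMSSW_3_8_4",
                   "ApplicationFamily": "output",
                   "Conditions": "GR_R_38X_V13"}
    primary = createPrimaryDataset(datasetInfo)
    algo = createAlgorithmForInsert(datasetInfo)
    return createProcessedDataset(primary, algo, datasetInfo, apiRef)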

def createDBSFiles(fjrFileInfo, jobType = None, apiRef = None):
    """
    _createDBSFiles_

    Create a list of DBS File instances from the file details contained
    in a FwkJobRep.FileInfo instance describing an output file.
    Does not insert files, returns a list of DbsFile objects.
    Does insert runs and lumisections if DBS API reference is passed.
    """
    results = []
    inputLFNs = [x['LFN'] for x in fjrFileInfo.inputFiles]
    checksum = fjrFileInfo.checksums['cksum']
    adler32sum = fjrFileInfo.checksums.get('adler32', '')

    nEvents = int(fjrFileInfo['TotalEvents'])

    if len(fjrFileInfo.dataset) <= 0:
        logging.error("No dataset info found in FWJobReport!")
        return results

    #  //
    # // Set FileType
    #//
    if fjrFileInfo.has_key('FileType'):
        fileType = fjrFileInfo['FileType']
    else:
        fileType = 'EDM'

    #
    # FIXME: at this point I should use the mc or data event type from
    #        the jobreport. Until this is supported by the framework,
    #        we use the workaround that mc job reports have an empty
    #        lumisections list (stripped in DBSInterface)
    #
    lumiList = []
    if (len(fjrFileInfo.getLumiSections()) > 0):
        #
        # insert runs (for data files from detector)
        #
        if (apiRef != None):
            for runinfo in fjrFileInfo.runs:
                run = DbsRun(
                    RunNumber = long(runinfo),
                    NumberOfEvents = 0,
                    NumberOfLumiSections = 0,
                    TotalLuminosity = 0,
                    StoreNumber = 0,
                    StartOfRun = 0,
                    EndOfRun = 0,
                    )
                apiRef.insertRun(run)

        #
        # insert lumisections (for data files from detector)
        # associate files with lumisections (for all data files)
        #
        for lumiinfo in fjrFileInfo.getLumiSections():
            lumi = DbsLumiSection(
                LumiSectionNumber = long(lumiinfo['LumiSectionNumber']),
                StartEventNumber = 0,
                EndEventNumber = 0,
                LumiStartTime = 0,
                LumiEndTime = 0,
                RunNumber = long(lumiinfo['RunNumber']),
                )
            # Isn't needed, causes monster slowdown
            #if (apiRef != None):
            #    apiRef.insertLumiSection(lumi)
            lumiList.append(lumi)

        logging.debug("Lumi associated to file is: %s" % ([x for x in lumiList]))

    #  //
    # // Dataset info related to files and creation of DbsFile object
    #//
    for dataset in fjrFileInfo.dataset:
        primary = createPrimaryDataset(dataset)
        if jobType == "Merge":
            algo = createMergeAlgorithm(dataset)
        else:
            algo = createAlgorithmForInsert(dataset)

        processed = createProcessedDataset(primary, algo, dataset)

        dbsFileInstance = DbsFile(
            Checksum = checksum,
            Adler32 = adler32sum,
            NumberOfEvents = nEvents,
            LogicalFileName = fjrFileInfo['LFN'],
            FileSize = int(fjrFileInfo['Size']),
            Status = "VALID",
            ValidationStatus = 'VALID',
            FileType = fileType,
            Dataset = processed,
            TierList = makeTierList(dataset['DataTier']),
            AlgoList = [algo],
            LumiList = lumiList,
            ParentList = inputLFNs,
            BranchList = fjrFileInfo.branches,
            )
        results.append(dbsFileInstance)

    return results
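
# Illustrative sketch, not part of the original module: createDBSFiles needs a
# FwkJobRep.FileInfo object (built elsewhere by the job report parser), so one
# is taken as an argument here rather than constructed. The jobType switch
# selects the merge algorithm; runs are only inserted when a DbsApi reference
# is given, while lumi sections are attached to the files but not inserted.
def _exampleCreateDBSFiles(fjrFileInfo, apiRef = None):
    # Merge job reports get the merge algorithm attached to their files;
    # any other jobType falls through to the minimal insert algorithm.
    dbsFiles = createDBSFiles(fjrFileInfo, jobType = "Merge", apiRef = apiRef)
    return dbsFiles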

def createDBSStorageElement(seName):
    """
    _createDBSStorageElement_

    Return a DbsStorageElement with the name provided
    """
    return DbsStorageElement(Name = seName)


def createDBSFileBlock(blockName):
    """
    _createDBSFileBlock_

    return a DbsFileBlock object with the block name provided

    NOTE: This method DOES NOT create a new block in DBS
    """
    return DbsFileBlock(Name = blockName)

def getDBSFileBlock(dbsApiRef, procDataset, seName):
    """
    _getDBSFileBlock_

    Given the procDataset and seName provided, get the currently open
    file block for that dataset/se pair.
    If an open block does not exist, then create a new block and
    return that
    """
    logging.warning("getDBSFileBlock(): dset, se: %s, %s" % (procDataset, seName))

    allBlocks = dbsApiRef.listBlocks(procDataset, block_name = "*",
                                     storage_element_name = "*")
    logging.warning("getDBSFileBlock(): all blocks %s" % allBlocks)

    openBlocks = [b for b in allBlocks if str(b['OpenForWriting']) == "1"]
    logging.warning("getDBSFileBlock(): open blocks %s" % openBlocks)

    blockRef = None
    if len(openBlocks) > 1:
        msg = "Too many open blocks for dataset:\n"
        msg += "SE: %s\n" % seName
        msg += "Dataset: %s\n" % procDataset
        msg += "Using last open block\n"
        logging.warning(msg)
        blockRef = openBlocks[-1]
    elif len(openBlocks) == 1:
        blockRef = openBlocks[0]

    if blockRef == None:
        #  //
        # // Need to create new block
        #//
        logging.warning("getDBSFileBlock(): Creating a new block...")
        newBlockName = dbsApiRef.insertBlock(procDataset, None,
                                             storage_element_list = [seName])

        # get from DBS listBlocks API the DbsFileBlock newly inserted
        blocks = dbsApiRef.listBlocks(procDataset, block_name = newBlockName)
        if len(blocks) > 1:
            msg = "Too many blocks with the same name: %s:\n" % newBlockName
            msg += "Using last block\n"
            logging.warning(msg)
            blockRef = blocks[-1]
        elif len(blocks) == 1:
            blockRef = blocks[0]
        else:
            msg = "No FileBlock found to add files to"
            logging.error(msg)
            # FIXME: throw an error ?

        ## StorageElementList below is wrong: it should be a list of dictionaries [ { 'Name': seName } ].
        ## In order to define the DbsFileBlock it should be enough to specify its block name and
        ## it shouldn't be needed to specify the SE and Dataset again;
        ## however, since this is not the case, it's safer to get the DbsFileBlock from the listBlocks DBS API
        ## rather than defining a DbsFileBlock directly.
        # blockRef = DbsFileBlock(
        #     Name = newBlockName,
        #     Dataset = procDataset,
        #     StorageElementList = [ seName ]
        # )

    logging.warning("Open FileBlock located at SE: %s to use is FileBlock: %s" % (seName, blockRef['Name']))
    return blockRef
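
# Illustrative sketch, not part of the original module: locating (or creating)
# the open block for a dataset/SE pair before adding files to it. The storage
# element name is a placeholder; dbsApiRef would be a configured DbsApi and
# procDataset a dataset identifier accepted by listBlocks.
def _exampleGetDBSFileBlock(dbsApiRef, procDataset):
    block = getDBSFileBlock(dbsApiRef, procDataset, "se.example.org")
    return block['Name']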