
/modules/websearch/lib/websearch_webcoll.py

https://github.com/chokribr/invenio-1
Python | 1209 lines | 1166 code | 12 blank | 31 comment | 23 complexity | b77d0ddd39ef19f420a9551c5f7452a0 MD5
Possible License(s): GPL-2.0


  1. ## This file is part of Invenio.
  2. ## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 CERN.
  3. ##
  4. ## Invenio is free software; you can redistribute it and/or
  5. ## modify it under the terms of the GNU General Public License as
  6. ## published by the Free Software Foundation; either version 2 of the
  7. ## License, or (at your option) any later version.
  8. ##
  9. ## Invenio is distributed in the hope that it will be useful, but
  10. ## WITHOUT ANY WARRANTY; without even the implied warranty of
  11. ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. ## General Public License for more details.
  13. ##
  14. ## You should have received a copy of the GNU General Public License
  15. ## along with Invenio; if not, write to the Free Software Foundation, Inc.,
  16. ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  17. """Create Invenio collection cache."""
  18. __revision__ = "$Id$"
  19. import calendar
  20. import copy
  21. import sys
  22. import cgi
  23. import re
  24. import os
  25. import string
  26. import time
  27. import cPickle
  28. from invenio.config import \
  29. CFG_CERN_SITE, \
  30. CFG_WEBSEARCH_INSTANT_BROWSE, \
  31. CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS, \
  32. CFG_WEBSEARCH_I18N_LATEST_ADDITIONS, \
  33. CFG_CACHEDIR, \
  34. CFG_SITE_LANG, \
  35. CFG_SITE_NAME, \
  36. CFG_SITE_LANGS, \
  37. CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES, \
  38. CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, \
  39. CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS, \
  40. CFG_SCOAP3_SITE
  41. from invenio.messages import gettext_set_language, language_list_long
  42. from invenio.search_engine import search_pattern_parenthesised, get_creation_date, get_field_i18nname, collection_restricted_p, sort_records, EM_REPOSITORY
  43. from invenio.dbquery import run_sql, Error, get_table_update_time
  44. from invenio.bibrank_record_sorter import get_bibrank_methods
  45. from invenio.dateutils import convert_datestruct_to_dategui, strftime
  46. from invenio.bibformat import format_record
  47. from invenio.shellutils import mymkdir
  48. from invenio.intbitset import intbitset
  49. from invenio.websearch_external_collections import \
  50. external_collection_load_states, \
  51. dico_collection_external_searches, \
  52. external_collection_sort_engine_by_name
  53. from invenio.bibtask import task_init, task_get_option, task_set_option, \
  54. write_message, task_has_option, task_update_progress, \
  55. task_sleep_now_if_required
  56. import invenio.template
  57. websearch_templates = invenio.template.load('websearch')
  58. from invenio.websearch_external_collections_searcher import external_collections_dictionary
  59. from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_TIMEOUT
  60. from invenio.websearch_external_collections_config import CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS
  61. ## global vars
  62. COLLECTION_HOUSE = {} # will hold collections we treat in this run of the program; a dict of {collname1: collobject1, ...}
  63. # CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE -- cache timestamp
  64. # tolerance (in seconds), to account for the fact that an admin might
  65. # accidentally happen to edit the collection definitions at exactly
  66. # the same second when some webcoll process was about to be started.
  67. # In order to be safe, let's put an exaggerated timestamp tolerance
  68. # value such as 20 seconds:
  69. CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE = 20
  70. # CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE -- location of the cache
  71. # timestamp file:
  72. CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_updated" % CFG_CACHEDIR
  73. # CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE -- location of the cache
  74. # timestamp file used when running webcoll in fast mode.
  75. CFG_CACHE_LAST_FAST_UPDATED_TIMESTAMP_FILE = "%s/collections/last_fast_updated" % CFG_CACHEDIR
  76. def get_collection(colname):
  77. """Return collection object from the collection house for given colname.
  78. If it does not exist, then create it."""
  79. if not COLLECTION_HOUSE.has_key(colname):
  80. colobject = Collection(colname)
  81. COLLECTION_HOUSE[colname] = colobject
  82. return COLLECTION_HOUSE[colname]
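# Usage sketch for the memoisation above (illustrative only; assumes a configured
# Invenio 1.x instance, e.g. an interactive shell on the Invenio host):
#
#   >>> home = get_collection(CFG_SITE_NAME)    # first call instantiates Collection(CFG_SITE_NAME)
#   >>> again = get_collection(CFG_SITE_NAME)   # later calls reuse the cached object
#   >>> home is again
#   True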
  83. ## auxiliary functions:
  84. def is_selected(var, fld):
  85. "Checks if the two are equal, and if yes, returns ' selected'. Useful for select boxes."
  86. if var == fld:
  87. return ' selected="selected"'
  88. else:
  89. return ""
  90. def get_field(recID, tag):
  91. "Gets list of field 'tag' for the record with 'recID' system number."
  92. out = []
  93. digit = tag[0:2]
  94. bx = "bib%sx" % digit
  95. bibx = "bibrec_bib%sx" % digit
  96. query = "SELECT bx.value FROM %s AS bx, %s AS bibx WHERE bibx.id_bibrec='%s' AND bx.id=bibx.id_bibxxx AND bx.tag='%s'" \
  97. % (bx, bibx, recID, tag)
  98. res = run_sql(query)
  99. for row in res:
  100. out.append(row[0])
  101. return out
  102. def check_nbrecs_for_all_external_collections():
  103. """Check if any of the external collections have changed their total number of records, aka nbrecs.
  104. Return True if any of the total numbers of records have changed and False if they're all the same."""
  105. res = run_sql("SELECT name FROM collection WHERE dbquery LIKE 'hostedcollection:%';")
  106. for row in res:
  107. coll_name = row[0]
  108. if (get_collection(coll_name)).check_nbrecs_for_external_collection():
  109. return True
  110. return False
  111. class Collection:
  112. "Holds the information on collections (id,name,dbquery)."
  113. def __init__(self, name=""):
  114. "Creates collection instance by querying the DB configuration database about 'name'."
  115. self.calculate_reclist_run_already = 0 # to speed things up without much refactoring
  116. self.update_reclist_run_already = 0 # to speed things up without much refactoring
  117. self.reclist_updated_since_start = 0 # to check if webpage cache need rebuilding
  118. self.reclist_with_nonpublic_subcolls = intbitset()
  119. # temporary counters for the number of records in hosted collections
  120. self.nbrecs_tmp = None # number of records in a hosted collection
  121. self.nbrecs_from_hosted_collections = 0 # total number of records from
  122. # descendant hosted collections
  123. if not name:
  124. self.name = CFG_SITE_NAME # by default we are working on the home page
  125. self.id = 1
  126. self.dbquery = None
  127. self.nbrecs = None
  128. self.reclist = intbitset()
  129. self.old_reclist = intbitset()
  130. self.reclist_updated_since_start = 1
  131. else:
  132. self.name = name
  133. try:
  134. res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection
  135. WHERE name=%s""", (name,))
  136. if res:
  137. self.id = res[0][0]
  138. self.name = res[0][1]
  139. self.dbquery = res[0][2]
  140. self.nbrecs = res[0][3]
  141. try:
  142. self.reclist = intbitset(res[0][4])
  143. except:
  144. self.reclist = intbitset()
  145. self.reclist_updated_since_start = 1
  146. else: # collection does not exist!
  147. self.id = None
  148. self.dbquery = None
  149. self.nbrecs = None
  150. self.reclist = intbitset()
  151. self.reclist_updated_since_start = 1
  152. self.old_reclist = intbitset(self.reclist)
  153. except Error, e:
  154. print "Error %d: %s" % (e.args[0], e.args[1])
  155. sys.exit(1)
  156. def get_example_search_queries(self):
  157. """Returns list of sample search queries for this collection.
  158. """
  159. res = run_sql("""SELECT example.body FROM example
  160. LEFT JOIN collection_example on example.id=collection_example.id_example
  161. WHERE collection_example.id_collection=%s ORDER BY collection_example.score""", (self.id,))
  162. return [query[0] for query in res]
  163. def get_name(self, ln=CFG_SITE_LANG, name_type="ln", prolog="", epilog="", prolog_suffix=" ", epilog_suffix=""):
  164. """Return nicely formatted collection name for language LN.
  165. The NAME_TYPE may be 'ln' (=long name), 'sn' (=short name), etc."""
  166. out = prolog
  167. i18name = ""
  168. res = run_sql("SELECT value FROM collectionname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, name_type))
  169. try:
  170. i18name += res[0][0]
  171. except IndexError:
  172. pass
  173. if i18name:
  174. out += i18name
  175. else:
  176. out += self.name
  177. out += epilog
  178. return out
  179. def get_collectionbox_name(self, ln=CFG_SITE_LANG, box_type="r"):
  180. """
  181. Return collection-specific labelling of 'Focus on' (regular
  182. collection), 'Narrow by' (virtual collection) and 'Latest
  183. addition' boxes.
  184. If translation for given language does not exist, use label
  185. for CFG_SITE_LANG. If no custom label is defined for
  186. CFG_SITE_LANG, return default label for the box.
  187. @param ln: the language of the label
  188. @param box_type: can be 'r' (=Narrow by), 'v' (=Focus on), 'l' (=Latest additions)
  189. """
  190. i18name = ""
  191. res = run_sql("SELECT value FROM collectionboxname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, ln, box_type))
  192. try:
  193. i18name = res[0][0]
  194. except IndexError:
  195. res = run_sql("SELECT value FROM collectionboxname WHERE id_collection=%s AND ln=%s AND type=%s", (self.id, CFG_SITE_LANG, box_type))
  196. try:
  197. i18name = res[0][0]
  198. except IndexError:
  199. pass
  200. if not i18name:
  201. # load the right message language
  202. _ = gettext_set_language(ln)
  203. if box_type == "v":
  204. i18name = _('Focus on:')
  205. elif box_type == "r":
  206. if CFG_SCOAP3_SITE:
  207. i18name = _('Narrow by publisher/journal:')
  208. else:
  209. i18name = _('Narrow by collection:')
  210. elif box_type == "l":
  211. i18name = _('Latest additions:')
  212. return i18name
  213. def get_ancestors(self):
  214. "Returns list of ancestors of the current collection."
  215. ancestors = []
  216. ancestors_ids = intbitset()
  217. id_son = self.id
  218. while 1:
  219. query = "SELECT cc.id_dad,c.name FROM collection_collection AS cc, collection AS c "\
  220. "WHERE cc.id_son=%d AND c.id=cc.id_dad" % int(id_son)
  221. res = run_sql(query, None, 1)
  222. if res:
  223. col_ancestor = get_collection(res[0][1])
  224. # looking for loops
  225. if self.id in ancestors_ids:
  226. write_message("Loop found in collection %s" % self.name, stream=sys.stderr)
  227. raise OverflowError("Loop found in collection %s" % self.name)
  228. else:
  229. ancestors.append(col_ancestor)
  230. ancestors_ids.add(col_ancestor.id)
  231. id_son = res[0][0]
  232. else:
  233. break
  234. ancestors.reverse()
  235. return ancestors
  236. def restricted_p(self):
  237. """Predicate to test if the collection is restricted or not. Return the contect of the
  238. `restrited' column of the collection table (typically Apache group). Otherwise return
  239. None if the collection is public."""
  240. if collection_restricted_p(self.name):
  241. return 1
  242. return None
  243. def get_sons(self, type='r'):
  244. "Returns list of direct sons of type 'type' for the current collection."
  245. sons = []
  246. id_dad = self.id
  247. query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\
  248. "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC, c.name ASC" % (int(id_dad), type)
  249. res = run_sql(query)
  250. for row in res:
  251. sons.append(get_collection(row[1]))
  252. return sons
  253. def get_descendants(self, type='r'):
  254. "Returns list of all descendants of type 'type' for the current collection."
  255. descendants = []
  256. descendant_ids = intbitset()
  257. id_dad = self.id
  258. query = "SELECT cc.id_son,c.name FROM collection_collection AS cc, collection AS c "\
  259. "WHERE cc.id_dad=%d AND cc.type='%s' AND c.id=cc.id_son ORDER BY score DESC" % (int(id_dad), type)
  260. res = run_sql(query)
  261. for row in res:
  262. col_desc = get_collection(row[1])
  263. # looking for loops
  264. if self.id in descendant_ids:
  265. write_message("Loop found in collection %s" % self.name, stream=sys.stderr)
  266. raise OverflowError("Loop found in collection %s" % self.name)
  267. else:
  268. descendants.append(col_desc)
  269. descendant_ids.add(col_desc.id)
  270. tmp_descendants = col_desc.get_descendants()
  271. for descendant in tmp_descendants:
  272. descendant_ids.add(descendant.id)
  273. descendants += tmp_descendants
  274. return descendants
  275. def write_cache_file(self, filename='', filebody={}):
  276. "Write a file inside collection cache."
  277. # open file:
  278. dirname = "%s/collections" % (CFG_CACHEDIR)
  279. mymkdir(dirname)
  280. fullfilename = dirname + "/%s.html" % filename
  281. try:
  282. os.umask(022)
  283. f = open(fullfilename, "wb")
  284. except IOError, v:
  285. try:
  286. (code, message) = v
  287. except:
  288. code = 0
  289. message = v
  290. print "I/O Error: " + str(message) + " (" + str(code) + ")"
  291. sys.exit(1)
  292. # print user info:
  293. write_message("... creating %s" % fullfilename, verbose=6)
  294. # print page body:
  295. cPickle.dump(filebody, f, cPickle.HIGHEST_PROTOCOL)
  296. # close file:
  297. f.close()
  298. def update_webpage_cache(self, lang):
  299. """Create collection page header, navtrail, body (including left and right stripes) and footer, and
  300. call write_cache_file() afterwards to update the collection webpage cache."""
  301. ## precalculate latest additions for non-aggregate
  302. ## collections (the info is ln- and aas-independent)
  303. if self.dbquery:
  304. if CFG_WEBSEARCH_I18N_LATEST_ADDITIONS:
  305. self.create_latest_additions_info(ln=lang)
  306. else:
  307. self.create_latest_additions_info()
  308. # load the right message language
  309. _ = gettext_set_language(lang)
  310. # create dictionary with data
  311. cache = {"te_portalbox" : self.create_portalbox(lang, 'te'),
  312. "np_portalbox" : self.create_portalbox(lang, 'np'),
  313. "ne_portalbox" : self.create_portalbox(lang, 'ne'),
  314. "tp_portalbox" : self.create_portalbox(lang, "tp"),
  315. "lt_portalbox" : self.create_portalbox(lang, "lt"),
  316. "rt_portalbox" : self.create_portalbox(lang, "rt"),
  317. "last_updated" : convert_datestruct_to_dategui(time.localtime(),
  318. ln=lang)}
  319. for aas in CFG_WEBSEARCH_ENABLED_SEARCH_INTERFACES: # do light, simple and advanced search pages:
  320. cache["navtrail_%s" % aas] = self.create_navtrail_links(aas, lang)
  321. cache["searchfor_%s" % aas] = self.create_searchfor(aas, lang)
  322. cache["narrowsearch_%s" % aas] = self.create_narrowsearch(aas, lang, 'r')
  323. cache["focuson_%s" % aas] = self.create_narrowsearch(aas, lang, "v")+ \
  324. self.create_external_collections_box(lang)
  325. cache["instantbrowse_%s" % aas] = self.create_instant_browse(aas=aas, ln=lang)
  326. # write cache file
  327. self.write_cache_file("%s-ln=%s"%(self.name, lang), cache)
  328. return cache
  329. def create_navtrail_links(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG):
  330. """Creates navigation trail links, i.e. links to collection
  331. ancestors (except Home collection). If aas==1, then links to
  332. Advanced Search interfaces; otherwise Simple Search.
  333. """
  334. dads = []
  335. for dad in self.get_ancestors():
  336. if dad.name != CFG_SITE_NAME: # exclude Home collection
  337. dads.append((dad.name, dad.get_name(ln)))
  338. return websearch_templates.tmpl_navtrail_links(
  339. aas=aas, ln=ln, dads=dads)
  340. def create_portalbox(self, lang=CFG_SITE_LANG, position="rt"):
  341. """Creates portalboxes of language CFG_SITE_LANG of the position POSITION by consulting DB configuration database.
  342. The position may be: 'lt'='left top', 'rt'='right top', etc."""
  343. out = ""
  344. query = "SELECT p.title,p.body FROM portalbox AS p, collection_portalbox AS cp "\
  345. " WHERE cp.id_collection=%d AND p.id=cp.id_portalbox AND cp.ln='%s' AND cp.position='%s' "\
  346. " ORDER BY cp.score DESC" % (self.id, lang, position)
  347. res = run_sql(query)
  348. for row in res:
  349. title, body = row[0], row[1]
  350. if title:
  351. out += websearch_templates.tmpl_portalbox(title = title,
  352. body = body)
  353. else:
  354. # no title specified, so print body ``as is'' only:
  355. out += body
  356. return out
  357. def create_narrowsearch(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG, type="r"):
  358. """Creates list of collection descendants of type 'type' under title 'title'.
  359. If aas==1, then links to Advanced Search interfaces; otherwise Simple Search.
  360. Suitable for 'Narrow search' and 'Focus on' boxes."""
  361. # get list of sons and analyse it
  362. sons = self.get_sons(type)
  363. if not sons:
  364. return ''
  365. # get descendents
  366. descendants = self.get_descendants(type)
  367. grandsons = []
  368. if CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS:
  369. # load grandsons for each son
  370. for son in sons:
  371. grandsons.append(son.get_sons())
  372. # return ""
  373. return websearch_templates.tmpl_narrowsearch(
  374. aas = aas,
  375. ln = ln,
  376. type = type,
  377. father = self,
  378. has_grandchildren = len(descendants)>len(sons),
  379. sons = sons,
  380. display_grandsons = CFG_WEBSEARCH_NARROW_SEARCH_SHOW_GRANDSONS,
  381. grandsons = grandsons
  382. )
  383. def create_external_collections_box(self, ln=CFG_SITE_LANG):
  384. external_collection_load_states()
  385. if not dico_collection_external_searches.has_key(self.id):
  386. return ""
  387. engines_list = external_collection_sort_engine_by_name(dico_collection_external_searches[self.id])
  388. return websearch_templates.tmpl_searchalso(ln, engines_list, self.id)
  389. def create_latest_additions_info(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, ln=CFG_SITE_LANG):
  390. """
  391. Create info about latest additions that will be used for
  392. create_instant_browse() later.
  393. """
  394. self.latest_additions_info = []
  395. if self.nbrecs and self.reclist:
  396. # firstly, get last 'rg' records:
  397. recIDs = list(self.reclist)
  398. of = 'hb'
  399. # CERN hack begins: tweak latest additions for selected collections:
  400. if CFG_CERN_SITE:
  401. # alter recIDs list for some CERN collections:
  402. this_year = time.strftime("%Y", time.localtime())
  403. if self.name in ['CERN Yellow Reports','Videos']:
  404. last_year = str(int(this_year) - 1)
  405. # detect recIDs only from this and past year:
  406. recIDs = list(self.reclist & \
  407. search_pattern_parenthesised(p='year:%s or year:%s' % \
  408. (this_year, last_year)))
  409. elif self.name in ['VideosXXX']:
  410. # detect recIDs only from this year:
  411. recIDs = list(self.reclist & \
  412. search_pattern_parenthesised(p='year:%s' % this_year))
  413. elif self.name == 'CMS Physics Analysis Summaries' and \
  414. 1281585 in self.reclist:
  415. # REALLY, REALLY temporary hack
  416. recIDs = list(self.reclist)
  417. recIDs.remove(1281585)
  418. # apply special filters:
  419. if self.name in ['Videos']:
  420. # select only videos with movies:
  421. recIDs = list(intbitset(recIDs) & \
  422. search_pattern_parenthesised(p='collection:"PUBLVIDEOMOVIE"'))
  423. of = 'hvp'
  424. # sort some CERN collections specially:
  425. if self.name in ['Videos',
  426. 'Video Clips',
  427. 'Video Movies',
  428. 'Video News',
  429. 'Video Rushes',
  430. 'Webcast',
  431. 'ATLAS Videos',
  432. 'Restricted Video Movies',
  433. 'Restricted Video Rushes',
  434. 'LHC First Beam Videos',
  435. 'CERN openlab Videos']:
  436. recIDs = sort_records(None, recIDs, '269__c')
  437. elif self.name in ['LHCb Talks']:
  438. recIDs = sort_records(None, recIDs, 'reportnumber')
  439. elif self.name in ['CERN Yellow Reports']:
  440. recIDs = sort_records(None, recIDs, '084__a')
  441. # CERN hack ends.
  442. total = len(recIDs)
  443. to_display = min(rg, total)
  444. for idx in range(total-1, total-to_display-1, -1):
  445. recid = recIDs[idx]
  446. self.latest_additions_info.append({'id': recid,
  447. 'format': format_record(recid, of, ln=ln),
  448. 'date': get_creation_date(recid, fmt="%Y-%m-%d<br />%H:%i")})
  449. return
  450. def create_instant_browse(self, rg=CFG_WEBSEARCH_INSTANT_BROWSE, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG):
  451. "Searches database and produces list of last 'rg' records."
  452. if self.restricted_p():
  453. return websearch_templates.tmpl_box_restricted_content(ln = ln)
  454. if str(self.dbquery).startswith("hostedcollection:"):
  455. return websearch_templates.tmpl_box_hosted_collection(ln = ln)
  456. if rg == 0:
  457. # do not show latest additions box
  458. return ""
  459. # CERN hack: do not display latest additions for some CERN collections:
  460. if CFG_CERN_SITE and self.name in ['Periodicals', 'Electronic Journals',
  461. 'Press Office Photo Selection',
  462. 'Press Office Video Selection']:
  463. return ""
  464. try:
  465. self.latest_additions_info
  466. latest_additions_info_p = True
  467. except:
  468. latest_additions_info_p = False
  469. if latest_additions_info_p:
  470. passIDs = []
  471. for idx in range(0, min(len(self.latest_additions_info), rg)):
  472. # CERN hack: display the records in a grid layout, so do not show the related links
  473. if CFG_CERN_SITE and self.name in ['Videos']:
  474. passIDs.append({'id': self.latest_additions_info[idx]['id'],
  475. 'body': self.latest_additions_info[idx]['format'],
  476. 'date': self.latest_additions_info[idx]['date']})
  477. else:
  478. passIDs.append({'id': self.latest_additions_info[idx]['id'],
  479. 'body': self.latest_additions_info[idx]['format'] + \
  480. websearch_templates.tmpl_record_links(recid=self.latest_additions_info[idx]['id'],
  481. rm='citation',
  482. ln=ln),
  483. 'date': self.latest_additions_info[idx]['date']})
  484. if self.nbrecs > rg:
  485. url = websearch_templates.build_search_url(
  486. cc=self.name, jrec=rg+1, ln=ln, aas=aas)
  487. else:
  488. url = ""
  489. # CERN hack: display the records in a grid layout
  490. if CFG_CERN_SITE and self.name in ['Videos']:
  491. return websearch_templates.tmpl_instant_browse(
  492. aas=aas, ln=ln, recids=passIDs, more_link=url, grid_layout=True, father=self)
  493. return websearch_templates.tmpl_instant_browse(
  494. aas=aas, ln=ln, recids=passIDs, more_link=url, father=self)
  495. return websearch_templates.tmpl_box_no_records(ln=ln)
  496. def create_searchoptions(self):
  497. "Produces 'Search options' portal box."
  498. box = ""
  499. query = """SELECT DISTINCT(cff.id_field),f.code,f.name FROM collection_field_fieldvalue AS cff, field AS f
  500. WHERE cff.id_collection=%d AND cff.id_fieldvalue IS NOT NULL AND cff.id_field=f.id
  501. ORDER BY cff.score DESC""" % self.id
  502. res = run_sql(query)
  503. if res:
  504. for row in res:
  505. field_id = row[0]
  506. field_code = row[1]
  507. field_name = row[2]
  508. query_bis = """SELECT fv.value,fv.name FROM fieldvalue AS fv, collection_field_fieldvalue AS cff
  509. WHERE cff.id_collection=%d AND cff.type='seo' AND cff.id_field=%d AND fv.id=cff.id_fieldvalue
  510. ORDER BY cff.score_fieldvalue DESC, cff.score DESC, fv.name ASC""" % (self.id, field_id)
  511. res_bis = run_sql(query_bis)
  512. if res_bis:
  513. values = [{'value' : '', 'text' : 'any' + ' ' + field_name}] # FIXME: internationalisation of "any"
  514. for row_bis in res_bis:
  515. values.append({'value' : cgi.escape(row_bis[0], 1), 'text' : row_bis[1]})
  516. box += websearch_templates.tmpl_select(
  517. fieldname = field_code,
  518. values = values
  519. )
  520. return box
  521. def create_sortoptions(self, ln=CFG_SITE_LANG):
  522. """Produces 'Sort options' portal box."""
  523. # load the right message language
  524. _ = gettext_set_language(ln)
  525. box = ""
  526. query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff
  527. WHERE id_collection=%d AND cff.type='soo' AND cff.id_field=f.id
  528. ORDER BY cff.score DESC, f.name ASC""" % self.id
  529. values = [{'value' : '', 'text': "- %s -" % _("latest first")}]
  530. res = run_sql(query)
  531. if res:
  532. for row in res:
  533. values.append({'value' : row[0], 'text': get_field_i18nname(row[1], ln)})
  534. else:
  535. for tmp in ('title', 'author', 'report number', 'year'):
  536. values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
  537. box = websearch_templates.tmpl_select(
  538. fieldname = 'sf',
  539. css_class = 'address',
  540. values = values
  541. )
  542. box += websearch_templates.tmpl_select(
  543. fieldname = 'so',
  544. css_class = 'address',
  545. values = [
  546. {'value' : 'a' , 'text' : _("asc.")},
  547. {'value' : 'd' , 'text' : _("desc.")}
  548. ]
  549. )
  550. return box
  551. def create_rankoptions(self, ln=CFG_SITE_LANG):
  552. "Produces 'Rank options' portal box."
  553. # load the right message language
  554. _ = gettext_set_language(ln)
  555. values = [{'value' : '', 'text': "- %s %s -" % (string.lower(_("OR")), _("rank by"))}]
  556. for (code, name) in get_bibrank_methods(self.id, ln):
  557. values.append({'value' : code, 'text': name})
  558. box = websearch_templates.tmpl_select(
  559. fieldname = 'rm',
  560. css_class = 'address',
  561. values = values
  562. )
  563. return box
  564. def create_displayoptions(self, ln=CFG_SITE_LANG):
  565. "Produces 'Display options' portal box."
  566. # load the right message language
  567. _ = gettext_set_language(ln)
  568. values = []
  569. for i in ['10', '25', '50', '100', '250', '500']:
  570. values.append({'value' : i, 'text' : i + ' ' + _("results")})
  571. box = websearch_templates.tmpl_select(
  572. fieldname = 'rg',
  573. selected = str(CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS),
  574. css_class = 'address',
  575. values = values
  576. )
  577. if self.get_sons():
  578. box += websearch_templates.tmpl_select(
  579. fieldname = 'sc',
  580. css_class = 'address',
  581. values = [
  582. {'value' : '1' , 'text' : CFG_SCOAP3_SITE and _("split by publisher/journal") or _("split by collection")},
  583. {'value' : '0' , 'text' : _("single list")}
  584. ]
  585. )
  586. return box
  587. def create_formatoptions(self, ln=CFG_SITE_LANG):
  588. "Produces 'Output format options' portal box."
  589. # load the right message language
  590. _ = gettext_set_language(ln)
  591. box = ""
  592. values = []
  593. query = """SELECT f.code,f.name FROM format AS f, collection_format AS cf
  594. WHERE cf.id_collection=%d AND cf.id_format=f.id AND f.visibility='1'
  595. ORDER BY cf.score DESC, f.name ASC""" % self.id
  596. res = run_sql(query)
  597. if res:
  598. for row in res:
  599. values.append({'value' : row[0], 'text': row[1]})
  600. else:
  601. values.append({'value' : 'hb', 'text' : "HTML %s" % _("brief")})
  602. box = websearch_templates.tmpl_select(
  603. fieldname = 'of',
  604. css_class = 'address',
  605. values = values
  606. )
  607. return box
  608. def create_searchwithin_selection_box(self, fieldname='f', value='', ln='en'):
  609. """Produces 'search within' selection box for the current collection."""
  610. # get values
  611. query = """SELECT f.code,f.name FROM field AS f, collection_field_fieldvalue AS cff
  612. WHERE cff.type='sew' AND cff.id_collection=%d AND cff.id_field=f.id
  613. ORDER BY cff.score DESC, f.name ASC""" % self.id
  614. res = run_sql(query)
  615. values = [{'value' : '', 'text' : get_field_i18nname("any field", ln)}]
  616. if res:
  617. for row in res:
  618. values.append({'value' : row[0], 'text' : get_field_i18nname(row[1], ln)})
  619. else:
  620. if CFG_CERN_SITE:
  621. for tmp in ['title', 'author', 'abstract', 'report number', 'year']:
  622. values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
  623. else:
  624. for tmp in ['title', 'author', 'abstract', 'keyword', 'report number', 'journal', 'year', 'fulltext', 'reference']:
  625. values.append({'value' : tmp.replace(' ', ''), 'text' : get_field_i18nname(tmp, ln)})
  626. return websearch_templates.tmpl_searchwithin_select(
  627. fieldname = fieldname,
  628. ln = ln,
  629. selected = value,
  630. values = values
  631. )
  632. def create_searchexample(self):
  633. "Produces search example(s) for the current collection."
  634. out = "$collSearchExamples = getSearchExample(%d, $se);" % self.id
  635. return out
  636. def create_searchfor(self, aas=CFG_WEBSEARCH_DEFAULT_SEARCH_INTERFACE, ln=CFG_SITE_LANG):
  637. "Produces either Simple or Advanced 'Search for' box for the current collection."
  638. if aas == 1:
  639. return self.create_searchfor_advanced(ln)
  640. elif aas == 0:
  641. return self.create_searchfor_simple(ln)
  642. else:
  643. return self.create_searchfor_light(ln)
  644. def create_searchfor_light(self, ln=CFG_SITE_LANG):
  645. "Produces light 'Search for' box for the current collection."
  646. return websearch_templates.tmpl_searchfor_light(
  647. ln=ln,
  648. collection_id = self.name,
  649. collection_name=self.get_name(ln=ln),
  650. record_count=self.nbrecs,
  651. example_search_queries=self.get_example_search_queries(),
  652. )
  653. def create_searchfor_simple(self, ln=CFG_SITE_LANG):
  654. "Produces simple 'Search for' box for the current collection."
  655. return websearch_templates.tmpl_searchfor_simple(
  656. ln=ln,
  657. collection_id = self.name,
  658. collection_name=self.get_name(ln=ln),
  659. record_count=self.nbrecs,
  660. middle_option = self.create_searchwithin_selection_box(ln=ln),
  661. )
  662. def create_searchfor_advanced(self, ln=CFG_SITE_LANG):
  663. "Produces advanced 'Search for' box for the current collection."
  664. return websearch_templates.tmpl_searchfor_advanced(
  665. ln = ln,
  666. collection_id = self.name,
  667. collection_name=self.get_name(ln=ln),
  668. record_count=self.nbrecs,
  669. middle_option_1 = self.create_searchwithin_selection_box('f1', ln=ln),
  670. middle_option_2 = self.create_searchwithin_selection_box('f2', ln=ln),
  671. middle_option_3 = self.create_searchwithin_selection_box('f3', ln=ln),
  672. searchoptions = self.create_searchoptions(),
  673. sortoptions = self.create_sortoptions(ln),
  674. rankoptions = self.create_rankoptions(ln),
  675. displayoptions = self.create_displayoptions(ln),
  676. formatoptions = self.create_formatoptions(ln)
  677. )
  678. def calculate_reclist(self):
  679. """
  680. Calculate, set and return the (reclist,
  681. reclist_with_nonpublic_subcolls,
  682. nbrecs_from_hosted_collections)
  683. tuple for the given collection."""
  684. if str(self.dbquery).startswith("hostedcollection:"):
  685. # we don't normally use this function to calculate the reclist
  686. # for hosted collections. In case we do, recursively for a regular
  687. # ancestor collection, then quickly return the object attributes.
  688. return (self.reclist,
  689. self.reclist_with_nonpublic_subcolls,
  690. self.nbrecs)
  691. if self.calculate_reclist_run_already:
  692. # do we really have to recalculate? If not,
  693. # then return the object attributes
  694. return (self.reclist,
  695. self.reclist_with_nonpublic_subcolls,
  696. self.nbrecs_from_hosted_collections)
  697. write_message("... calculating reclist of %s" % self.name, verbose=6)
  698. reclist = intbitset() # will hold results for public sons only; good for storing into DB
  699. reclist_with_nonpublic_subcolls = intbitset() # will hold results for both public and nonpublic sons; good for deducing total
  700. # number of documents
  701. nbrecs_from_hosted_collections = 0 # will hold the total number of records from descendant hosted collections
  702. if not self.dbquery:
  703. # A - collection does not have dbquery, so query recursively all its sons
  704. # that are either non-restricted or that have the same restriction rules
  705. for coll in self.get_sons():
  706. coll_reclist,\
  707. coll_reclist_with_nonpublic_subcolls,\
  708. coll_nbrecs_from_hosted_collection = coll.calculate_reclist()
  709. if ((coll.restricted_p() is None) or
  710. (coll.restricted_p() == self.restricted_p())):
  711. # add this reclist ``for real'' only if it is public
  712. reclist.union_update(coll_reclist)
  713. reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls)
  714. # increment the total number of records from descendant hosted collections
  715. nbrecs_from_hosted_collections += coll_nbrecs_from_hosted_collection
  716. else:
  717. # B - collection does have dbquery, so compute it:
  718. # (note: explicitly remove DELETED records)
  719. if CFG_CERN_SITE:
  720. reclist = search_pattern_parenthesised(None, self.dbquery + \
  721. ' -980__:"DELETED" -980__:"DUMMY"', ap=-9) # ap=-9 to allow queries containing hidden tags
  722. else:
  723. reclist = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"', ap=-9) # ap=-9 to allow queries containing hidden tags
  724. reclist_with_nonpublic_subcolls = copy.deepcopy(reclist)
  725. # store the results:
  726. self.nbrecs_from_hosted_collections = nbrecs_from_hosted_collections
  727. self.nbrecs = len(reclist_with_nonpublic_subcolls) + \
  728. nbrecs_from_hosted_collections
  729. self.reclist = reclist
  730. self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls
  731. # last but not least, update the speed-up flag:
  732. self.calculate_reclist_run_already = 1
  733. # return the two sets, as well as
  734. # the total number of records from descendant hosted collections:
  735. return (self.reclist,
  736. self.reclist_with_nonpublic_subcolls,
  737. self.nbrecs_from_hosted_collections)
  738. def calculate_nbrecs_for_external_collection(self, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
  739. """Calculate the total number of records, aka nbrecs, for given external collection."""
  740. #if self.calculate_reclist_run_already:
  741. # do we have to recalculate?
  742. #return self.nbrecs
  743. #write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6)
  744. if external_collections_dictionary.has_key(self.name):
  745. engine = external_collections_dictionary[self.name]
  746. if engine.parser:
  747. self.nbrecs_tmp = engine.parser.parse_nbrecs(timeout)
  748. if self.nbrecs_tmp >= 0: return self.nbrecs_tmp
  749. # the parse_nbrecs() function returns negative values for some specific cases
  750. # maybe we can handle these specific cases, some warnings or something
  751. # for now the total number of records remains silently the same
  752. else: return self.nbrecs
  753. else: write_message("External collection %s does not have a parser!" % self.name, verbose=6)
  754. else: write_message("External collection %s not found!" % self.name, verbose=6)
  755. return 0
  756. # last but not least, update the speed-up flag:
  757. #self.calculate_reclist_run_already = 1
  758. def check_nbrecs_for_external_collection(self):
  759. """Check if the external collections has changed its total number of records, aka nbrecs.
  760. Rerurns True if the total number of records has changed and False if it's the same"""
  761. write_message("*** self.nbrecs = %s / self.cal...ion = %s ***" % (str(self.nbrecs), str(self.calculate_nbrecs_for_external_collection())), verbose=6)
  762. write_message("*** self.nbrecs != self.cal...ion = %s ***" % (str(self.nbrecs != self.calculate_nbrecs_for_external_collection()),), verbose=6)
  763. return self.nbrecs != self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS)
  764. def set_nbrecs_for_external_collection(self):
  765. """Set this external collection's total number of records, aka nbrecs"""
  766. if self.calculate_reclist_run_already:
  767. # do we have to recalculate?
  768. return
  769. write_message("... calculating nbrecs of external collection %s" % self.name, verbose=6)
  770. if self.nbrecs_tmp:
  771. self.nbrecs = self.nbrecs_tmp
  772. else:
  773. self.nbrecs = self.calculate_nbrecs_for_external_collection(CFG_HOSTED_COLLECTION_TIMEOUT_NBRECS)
  774. # last but not least, update the speed-up flag:
  775. self.calculate_reclist_run_already = 1
  776. def update_reclist(self):
  777. "Update the record universe for given collection; nbrecs, reclist of the collection table."
  778. if self.update_reclist_run_already:
  779. # do we have to reupdate?
  780. return 0
  781. write_message("... updating reclist of %s (%s recs)" % (self.name, self.nbrecs), verbose=6)
  782. sys.stdout.flush()
  783. try:
  784. ## In principle we could skip this update if old_reclist==reclist
  785. ## however we just update it here in case of race-conditions.
  786. run_sql("UPDATE collection SET nbrecs=%s, reclist=%s WHERE id=%s",
  787. (self.nbrecs, self.reclist.fastdump(), self.id))
  788. if self.old_reclist != self.reclist:
  789. self.reclist_updated_since_start = 1
  790. else:
  791. write_message("... no changes in reclist detected", verbose=6)
  792. except Error, e:
  793. print "Database Query Error %d: %s." % (e.args[0], e.args[1])
  794. sys.exit(1)
  795. # last but not least, update the speed-up flag:
  796. self.update_reclist_run_already = 1
  797. return 0
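# Illustrative sketch of the per-collection update cycle driven with the methods
# above (the actual orchestration lives in task_run_core(), which falls outside
# this truncated listing; the collection name below is hypothetical):
#
#   >>> coll = get_collection("Articles")       # hypothetical collection name
#   >>> coll.calculate_reclist()                # recompute the record universe
#   >>> coll.update_reclist()                   # persist nbrecs/reclist to the collection table
#   >>> for lang in CFG_SITE_LANGS:
#   ...     coll.update_webpage_cache(lang)     # rebuild the cached page elements per language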
  798. def perform_display_collection(colID, colname, aas, ln, em, show_help_boxes):
  799. """Returns the data needed to display a collection page
  800. The arguments are as follows:
  801. colID - id of the collection to display
  802. colname - name of the collection to display
  803. aas - 0 if simple search, 1 if advanced search
  804. ln - language of the page
  805. em - code to display just part of the page
  806. show_help_boxes - whether to show the help boxes or not"""
  807. # check and update cache if necessary
  808. cachedfile = open("%s/collections/%s-ln=%s.html" %
  809. (CFG_CACHEDIR, colname, ln), "rb")
  810. try:
  811. data = cPickle.load(cachedfile)
  812. except ValueError:
  813. data = get_collection(colname).update_webpage_cache(ln)
  814. cachedfile.close()
  815. # check em value to return just part of the page
  816. if em != "":
  817. if EM_REPOSITORY["search_box"] not in em:
  818. data["searchfor_%s" % aas] = ""
  819. if EM_REPOSITORY["see_also_box"] not in em:
  820. data["focuson_%s" % aas] = ""
  821. if EM_REPOSITORY["all_portalboxes"] not in em:
  822. if EM_REPOSITORY["te_portalbox"] not in em:
  823. data["te_portalbox"] = ""
  824. if EM_REPOSITORY["np_portalbox"] not in em:
  825. data["np_portalbox"] = ""
  826. if EM_REPOSITORY["ne_portalbox"] not in em:
  827. data["ne_portalbox"] = ""
  828. if EM_REPOSITORY["tp_portalbox"] not in em:
  829. data["tp_portalbox"] = ""
  830. if EM_REPOSITORY["lt_portalbox"] not in em:
  831. data["lt_portalbox"] = ""
  832. if EM_REPOSITORY["rt_portalbox"] not in em:
  833. data["rt_portalbox"] = ""
  834. c_body = websearch_templates.tmpl_webcoll_body(ln, colID, data["te_portalbox"],
  835. data["searchfor_%s"%aas], data["np_portalbox"], data["narrowsearch_%s"%aas],
  836. data["focuson_%s"%aas], data["instantbrowse_%s"%aas], data["ne_portalbox"],
  837. em=="" or EM_REPOSITORY["body"] in em)
  838. if show_help_boxes <= 0:
  839. data["rt_portalbox"] = ""
  840. return (c_body, data["navtrail_%s"%aas], data["lt_portalbox"], data["rt_portalbox"],
  841. data["tp_portalbox"], data["te_portalbox"], data["last_updated"])
  842. def get_datetime(var, format_string="%Y-%m-%d %H:%M:%S"):
  843. """Returns a date string according to the format string.
  844. It can handle normal date strings and shifts with respect
  845. to now."""
  846. date = time.time()
  847. shift_re = re.compile("([-\+]{0,1})([\d]+)([dhms])")
  848. factors = {"d":24*3600, "h":3600, "m":60, "s":1}
  849. m = shift_re.match(var)
  850. if m:
  851. sign = m.groups()[0] == "-" and -1 or 1
  852. factor = factors[m.groups()[2]]
  853. value = float(m.groups()[1])
  854. date = time.localtime(date + sign * factor * value)
  855. date = strftime(format_string, date)
  856. else:
  857. date = time.strptime(var, format_string)
  858. date = strftime(format_string, date)
  859. return date
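# Worked examples for get_datetime() above: the shift syntax is an optional sign,
# an integer and a unit among d/h/m/s; anything else is parsed with the given
# format string (outputs assume "now" is 2014-07-15 12:00:00 local time):
#
#   >>> get_datetime("-7d")                     # one week before now
#   '2014-07-08 12:00:00'
#   >>> get_datetime("+2h")                     # two hours from now
#   '2014-07-15 14:00:00'
#   >>> get_datetime("2014-01-01 00:00:00")     # plain date string, re-emitted as-is
#   '2014-01-01 00:00:00'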
  860. def get_current_time_timestamp():
  861. """Return timestamp corresponding to the current time."""
  862. return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  863. def compare_timestamps_with_tolerance(timestamp1,
  864. timestamp2,
  865. tolerance=0):
  866. """Compare two timestamps TIMESTAMP1 and TIMESTAMP2, of the form
  867. '2005-03-31 17:37:26'. Optionally receives a TOLERANCE argument
  868. (in seconds). Return -1 if TIMESTAMP1 is less than TIMESTAMP2
  869. minus TOLERANCE, 0 if they are equal within TOLERANCE limit,
  870. and 1 if TIMESTAMP1 is greater than TIMESTAMP2 plus TOLERANCE.
  871. """
  872. # remove any trailing .00 in timestamps:
  873. timestamp1 = re.sub(r'\.[0-9]+$', '', timestamp1)
  874. timestamp2 = re.sub(r'\.[0-9]+$', '', timestamp2)
  875. # first convert timestamps to Unix epoch seconds:
  876. timestamp1_seconds = calendar.timegm(time.strptime(timestamp1, "%Y-%m-%d %H:%M:%S"))
  877. timestamp2_seconds = calendar.timegm(time.strptime(timestamp2, "%Y-%m-%d %H:%M:%S"))
  878. # now compare them:
  879. if timestamp1_seconds < timestamp2_seconds - tolerance:
  880. return -1
  881. elif timestamp1_seconds > timestamp2_seconds + tolerance:
  882. return 1
  883. else:
  884. return 0
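# Worked examples for compare_timestamps_with_tolerance(), using the 20-second
# CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE defined near the top of this module
# (0 = equal within the tolerance, -1/1 = clearly older/newer):
#
#   >>> compare_timestamps_with_tolerance("2005-03-31 17:37:26", "2005-03-31 17:37:40", 20)
#   0
#   >>> compare_timestamps_with_tolerance("2005-03-31 17:36:00", "2005-03-31 17:37:26", 20)
#   -1
#   >>> compare_timestamps_with_tolerance("2005-03-31 17:38:00", "2005-03-31 17:37:26", 20)
#   1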
  885. def get_database_last_updated_timestamp():
  886. """Return last updated timestamp for collection-related and
  887. record-related database tables.
  888. """
  889. database_tables_timestamps = []
  890. database_tables_timestamps.append(get_table_update_time('bibrec'))
  891. ## In INSPIRE bibfmt is on innodb and there is not such configuration
  892. bibfmt_last_update = run_sql("SELECT max(last_updated) FROM bibfmt")
  893. if bibfmt_last_update and bibfmt_last_update[0][0]:
  894. database_tables_timestamps.append(str(bibfmt_last_update[0][0]))
  895. try:
  896. database_tables_timestamps.append(get_table_update_time('idxWORD%'))
  897. except ValueError:
  898. # There are no indexes in the database. That's OK.
  899. pass
  900. database_tables_timestamps.append(get_table_update_time('collection%'))
  901. database_tables_timestamps.append(get_table_update_time('portalbox'))
  902. database_tables_timestamps.append(get_table_update_time('field%'))
  903. database_tables_timestamps.append(get_table_update_time('format%'))
  904. database_tables_timestamps.append(get_table_update_time('rnkMETHODNAME'))
  905. database_tables_timestamps.append(get_table_update_time('accROLE_accACTION_accARGUMENT', run_on_slave=True))
  906. return max(database_tables_timestamps)
  907. def get_cache_last_updated_timestamp():
  908. """Return last updated cache timestamp."""
  909. try:
  910. f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "r")
  911. except:
  912. return "1970-01-01 00:00:00"
  913. timestamp = f.read()
  914. f.close()
  915. return timestamp
  916. def set_cache_last_updated_timestamp(timestamp):
  917. """Set last updated cache timestamp to TIMESTAMP."""
  918. try:
  919. f = open(CFG_CACHE_LAST_UPDATED_TIMESTAMP_FILE, "w")
  920. except:
  921. return None # bail out if the timestamp file cannot be opened
  922. f.write(timestamp)
  923. f.close()
  924. return timestamp
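# Sketch of how the timestamp helpers above are presumably combined for the
# cache-freshness decision; the authoritative check lives in task_run_core(),
# which is not part of this truncated listing:
#
#   db_ts = get_database_last_updated_timestamp()
#   cache_ts = get_cache_last_updated_timestamp()
#   if compare_timestamps_with_tolerance(db_ts, cache_ts,
#                                        CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0:
#       # the database changed at (or after) the last run: rebuild the cache,
#       # then record the new timestamp for the next run
#       set_cache_last_updated_timestamp(get_current_time_timestamp())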
  925. def main():
  926. """Main that construct all the bibtask."""
  927. task_init(authorization_action="runwebcoll",
  928. authorization_msg="WebColl Task Submission",
  929. description="""Description:
  930. webcoll updates the collection cache (record universe for a
  931. given collection plus web page elements) based on invenio.conf and DB
  932. configuration parameters. If the collection name is passed as an argument,
  933. only this collection's cache will be updated. If the recursive option is
  934. set as well, the collection's descendants will also be updated.\n""",
  935. help_specific_usage=" -c, --collection\t Update cache for the given "
  936. "collection only. [all]\n"
  937. " -r, --recursive\t Update cache for the given collection and all its\n"
  938. "\t\t\t descendants (to be used in combination with -c). [no]\n"
  939. " -q, --quick\t\t Skip webpage cache update for those collections whose\n"
  940. "\t\t\t reclist was not changed. Note: if you use this option, it is advised\n"
  941. "\t\t\t to schedule, e.g. a nightly 'webcoll --force'. [no]\n"
  942. " -f, --force\t\t Force update even if cache is up to date. [no]\n"
  943. " -p, --part\t\t Update only certain cache parts (1=reclist,"
  944. " 2=webpage). [both]\n"
  945. " -l, --language\t Update pages in only certain language"
  946. " (e.g. fr,it,...). [all]\n",
  947. version=__revision__,
  948. specific_params=("c:rqfp:l:", [
  949. "collection=",
  950. "recursive",
  951. "quick",
  952. "force",
  953. "part=",
  954. "language="
  955. ]),
  956. task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter,
  957. task_submit_check_options_fnc=task_submit_check_options,
  958. task_run_fnc=task_run_core)
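# Example invocations of the webcoll task configured above, using only the
# options listed in help_specific_usage (collection names are site-specific):
#
#   webcoll --collection "Articles" --recursive    # refresh one collection and its descendants
#   webcoll --quick                                # skip webpage caches whose reclist is unchanged
#   webcoll --force                                # rebuild even if the cache looks up to date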
  959. def task_submit_elaborate_specific_parameter(key, value, opts, args):
  960. """ Given the string key it checks it's meaning, eventually using the value.
  961. Usually it fills some key in the options dict.
  962. It must return True if it has elaborated the key, False, if it doesn'…

The listing above is truncated; the full 1209-line file is available in the repository linked above.