
/opentasks/opentasks.py

https://bitbucket.org/grouplens/suggestbot
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Program to update a list of open tasks for a Wikipedia.

Copyright (C) 2012 Morten Wang

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Library General Public License for more details.

You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the
Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
Boston, MA 02110-1301, USA.
"""
import os;
import sys;
import re;
import random;

import oursql;
from datetime import datetime;

import pywikibot;
import PopQual;
class DummyConfig:
    """
    Dummy configuration class that exposes C{getConfig} and C{setConfig}
    methods that allow retrieval of what would've been SuggestBot's
    configuration values, used in the PopQual library.
    """
    def __init__(self):
        self.config = {
            'WP_LANGCODE': u"en",
            'CLASSIFIER_HOSTNAME': 'localhost',
            'CLASSIFIER_HOSTPORT': 10129,
            'QUALWS_URL': u'http://toolserver.org/~nettrom/suggestbot/quality-metadata.fcgi',
            };

    def getConfig(self, key=None):
        if not key:
            return None;
        try:
            return self.config[key];
        except KeyError:
            return None;

    def setConfig(self, key=None, value=None):
        if not key:
            return None;
        self.config[key] = value;
        return True;
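
# A minimal usage sketch for DummyConfig (illustrative only, not part of the
# original program flow): unknown keys return None rather than raising.
#
#   conf = DummyConfig();
#   conf.getConfig('WP_LANGCODE')    # -> u"en"
#   conf.getConfig('NO_SUCH_KEY')    # -> None
#   conf.setConfig(key='CLASSIFIER_HOSTPORT', value=10130)  # -> True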
class OpenTaskUpdater:
    def __init__(self, verbose=False, lang=None, mysqlConf=None,
                 taskPage=None, taskDef=None, pagesPerCategory=5,
                 editComment=None, testRun=False, samplingFactor=20,
                 classifierFile=None, logDBHost=None, logDBName=None,
                 logTableName=None, maxDBQueryAttempts=3):
        """
        Instantiate an object intended to update the list of open tasks.

        @param verbose: Write informational output?
        @type verbose: bool

        @param lang: Language code of the Wikipedia we're working on.
        @type lang: str

        @param mysqlConf: Path to the .my.cnf used for MySQL authentication.
        @type mysqlConf: str

        @param taskPage: Title of the page which contains the open tasks.
        @type taskPage: unicode

        @param taskDef: Dictionary mapping task IDs to categories containing tasks.
        @type taskDef: dict

        @param pagesPerCategory: Number of pages we want per task category.
        @type pagesPerCategory: int

        @param editComment: Edit comment used on successful update.
        @type editComment: unicode

        @param testRun: Do a test run?  Prints the resulting wikitext to stdout.
        @type testRun: bool

        @param samplingFactor: Multiplier to use for oversampling and selection
                               based on popularity/quality.  (0 = no oversampling)
        @type samplingFactor: int

        @param classifierFile: Name of the file containing hostname & port where
                               the quality classification server is listening.
        @type classifierFile: str

        @param logDBHost: Hostname of the database server used for logging.
        @type logDBHost: str

        @param logDBName: Name of the database used for logging.
        @type logDBName: str

        @param logTableName: Name of the table used for logging.
        @type logTableName: str

        @param maxDBQueryAttempts: Max number of database query attempts
                                   we will make before aborting.
        @type maxDBQueryAttempts: int
        """
        self.lang = 'en';
        if lang:
            self.lang = lang;

        self.mysqlConf = "~/.my.cnf";
        if mysqlConf:
            self.mysqlConf = mysqlConf;

        self.numPages = pagesPerCategory;

        self.editComment = u"Updating list of open tasks...";
        if editComment:
            self.editComment = editComment;

        self.taskPage = u"Wikipedia:Community portal/Opentask";
        if taskPage:
            self.taskPage = taskPage;

        if taskDef:
            self.taskDef = taskDef;
        else:
            # Wikify is deleted; the following templates and associated
            # categories take over for it:
            # {{dead end}}, {{underlinked}}, and {{overlinked}}.
            # "leadcleanup" refers to "Category:Wikipedia introduction cleanup",
            # where, amongst others, {{inadequate lead}}, {{lead too short}},
            # and {{lead too long}} end up.

            # The task definition is a dictionary where keys are the IDs of the
            # span elements to which the list of pages will go.  Values are one of:
            # 1: a unicode string, the name of the category to grab pages from;
            # 2: a list of unicode strings, names of categories;
            #    pages will be grabbed randomly from all categories combined.
            #
            # The name of a category can also be a tuple of the form
            # ("use-subs", u"[category name]"), which indicates that
            # we need to grab all sub-categories of the given category.
            # Pages will then be grabbed randomly from all the sub-categories.
            self.taskDef = {
                "wikify": [u"All dead-end pages",
                           u"All articles with too few wikilinks",
                           u"All articles with too many wikilinks"],
                "leadcleanup": ("use-subs", "Wikipedia introduction cleanup"),
                "copyedit": u"All articles needing copy edit",
                "update": u"All Wikipedia articles in need of updating",
                "translate": u"Wikipedia articles needing cleanup after translation",
                "verify": u"All pages needing factual verification",
                "or": u"All articles that may contain original research",
                "stub": u"Stub categories",
                # "merge": u"All articles to be merged",
                # "split": u"All articles to be split",
                # "expand": u"All articles to be expanded",
                # "npov": u"All NPOV disputes",
                # "cleanup": u"All pages needing cleanup",
                # "style": u"All articles needing style editing",
                # "orphan": u"All orphaned articles",
                # "afdrelist": {
                #     "catname": u"Relisted AfD debates",
                #     "pattern": u"Articles_for_deletion/%",  # filter
                #     "exclude": u"%/Log/%",                  # remove these
                #     "namespace": 4,          # namespace of pages we're looking for
                #     "prefix": u"Wikipedia:", # namespace prefix
                #     },
                };
        self.testRun = testRun;
        self.samplingFactor = samplingFactor;
        self.verbose = verbose;

        self.dbConn = None;
        self.dbCursor = None;
        self.maxDBQueryAttempts = maxDBQueryAttempts;

        self.logDBHost = 'sql-user-n';
        self.logDBName = 'u_nettrom';
        self.logTableName = 'u_nettrom_opentask_log';
        if logDBHost:
            self.logDBHost = logDBHost;
        if logDBName:
            self.logDBName = logDBName;
        if logTableName:
            self.logTableName = logTableName;

        self.popQualServer = None;
        if self.samplingFactor > 1:
            # Read the classifier configuration from file,
            # and instantiate the popularity/quality server.
            classifierFilename = "~/SuggestBot/classifier/hostname.txt";
            if classifierFile:
                classifierFilename = classifierFile;
            pqServerConfig = DummyConfig();
            with open(os.path.expanduser(classifierFilename)) as inputFile:
                hostname = inputFile.readline().strip();
                port = int(inputFile.readline().strip());
                pqServerConfig.setConfig(key="CLASSIFIER_HOSTNAME",
                                         value=hostname);
                pqServerConfig.setConfig(key="CLASSIFIER_HOSTPORT",
                                         value=port);
            self.popQualServer = PopQual.PopularityQualityServer(config=pqServerConfig);

        # Dictionary of results, a list of pages for each task
        self.foundTasks = dict([(taskId, []) for taskId in self.taskDef.keys()]);

        # Query to fetch a number of random pages from a given category.
        self.randomPageQuery = r"""SELECT /* LIMIT:120 */
                                   page_id, page_title
                                   FROM page JOIN categorylinks ON page_id=cl_from
                                   WHERE cl_to=?
                                   AND page_namespace=?
                                   AND page_random >= RAND()
                                   ORDER BY page_random LIMIT ?""";

        # Query to fetch all pages in a given namespace from a given category
        self.getAllPagesQuery = u"""SELECT page_id, page_title
                                    FROM page JOIN categorylinks ON page_id=cl_from
                                    WHERE cl_to=?
                                    AND page_namespace=?""";
    def connectDatabase(self, hostName=None, dbName=None):
        '''
        Connect to the database associated with our Wikipedia,
        or a given server and database if host/database names
        are supplied.

        @param hostName: hostname of the server we're connecting to
        @type hostName: str

        @param dbName: name of the database we will be using
        @type dbName: str
        '''
        if not hostName:
            hostName = u"{lang}wiki-p.rrdb.toolserver.org".format(lang=self.lang);
            dbName = u"{lang}wiki_p".format(lang=self.lang);

        if self.dbConn:
            self.disconnectDatabase();

        try:
            self.dbConn = oursql.connect(db=dbName,
                                         host=hostName,
                                         read_default_file=os.path.expanduser(self.mysqlConf),
                                         use_unicode=False,
                                         charset=None);
            self.dbCursor = self.dbConn.cursor();
        except oursql.Error, e:
            sys.stderr.write("Error: Unable to connect to database {0} on server {1}.\n".format(dbName, hostName));
            sys.stderr.write("Error {0}: {1}\n".format(e.args[0], e.args[1]));
            return False;

        # Ok, done
        return True;
    def disconnectDatabase(self):
        if not self.dbConn or not self.dbCursor:
            sys.stderr.write(u"Warning: can't disconnect connections that are None!\n".encode('utf-8'));
            return False;
        try:
            self.dbCursor.close();
            self.dbConn.close();
        except oursql.Error, e:
            sys.stderr.write("Error: Unable to disconnect from database!\n");
            sys.stderr.write("Error {0}: {1}\n".format(e.args[0], e.args[1]));
            return False;

        # Ok, done
        return True;

    def stopme(self):
        pywikibot.stopme();
    def findAfDs(self, afdDef=None, nPages=5):
        """
        Find relisted Articles for Deletion (AfDs).
        Expects a working database connection to exist as self.dbConn.

        @param afdDef: Dictionary defining how to find the relisted AfDs,
                       keys and their mapping:
                       catname: category where they are listed
                       pattern: SQL "LIKE" pattern for inclusion
                       exclude: SQL "LIKE" pattern for exclusion
                       namespace: namespace of the pages we're looking for
                       prefix: namespace prefix
        @type afdDef: dict

        @param nPages: number of pages to find
        @type nPages: int
        """
        if not afdDef:
            sys.stderr.write(u"Error: cannot find relisted AfDs without the definition of how to find them\n");
            return [];

        # Query to get pages from the relisted AfD category, matching a given
        # pattern, enabling exclusion based on certain titles (e.g. log-pages)
        # and limiting to a given namespace
        afdPageQuery = r"""SELECT /* LIMIT:120 */
                           page_id, page_title
                           FROM page JOIN categorylinks ON page_id=cl_from
                           WHERE cl_to=?
                           AND page_title LIKE ?
                           AND page_title NOT LIKE ?
                           AND page_namespace=?
                           AND page_random >= RAND()
                           ORDER BY page_random
                           LIMIT ?""";

        if self.verbose:
            sys.stderr.write("Info: trying to find {n} relisted articles for deletion...\n".format(n=nPages));

        foundPages = [];
        attempts = 0;
        while attempts < self.maxDBQueryAttempts:
            try:
                dbCursor = self.dbConn.cursor();
                dbCursor.execute(afdPageQuery,
                                 (re.sub(" ", "_",
                                         afdDef['catname']),
                                  afdDef['pattern'],
                                  afdDef['exclude'],
                                  afdDef['namespace'],
                                  nPages));
                for (pageId, pageTitle) in dbCursor:
                    foundPages.append(unicode(re.sub('_', ' ', pageTitle),
                                              'utf-8', errors='strict'));
            except oursql.Error, e:
                attempts += 1;
                sys.stderr.write("Error: Unable to execute query to get relisted AfDs, possibly retrying!\n");
                sys.stderr.write("Error {0}: {1}\n".format(e.args[0], e.args[1]));
                if e.errno == oursql.errnos['CR_SERVER_GONE_ERROR'] \
                   or e.errno == oursql.errnos['CR_SERVER_LOST']:
                    # lost connection, reconnect
                    self.connectDatabase();
                else:
                    break;
            else:
                # query succeeded, no need to retry
                break;
        if attempts >= self.maxDBQueryAttempts:
            sys.stderr.write("Error: Exhausted number of query attempts, aborting!\n");
            return foundPages;

        if self.verbose:
            sys.stderr.write(u"Info: found {n} relisted AfDs\n".format(n=len(foundPages)));

        # OK, done
        return foundPages;
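
    # Illustration only: an afdDef mirroring the commented-out default in
    # __init__ above ("updater" is a hypothetical OpenTaskUpdater instance):
    #
    #   afdDef = {"catname": u"Relisted AfD debates",
    #             "pattern": u"Articles_for_deletion/%",
    #             "exclude": u"%/Log/%",
    #             "namespace": 4,
    #             "prefix": u"Wikipedia:"};
    #   relisted = updater.findAfDs(afdDef=afdDef, nPages=5);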
    def findStubs(self, category=None, nPages=5):
        """
        Use the database to pick a random stub category,
        then pick a sufficient number of pages from that category.
        Expects a working database connection to exist as self.dbConn.

        @param category: The overarching stub category to find a random
                         category from.
        @type category: unicode

        @param nPages: number of pages to find
        @type nPages: int
        """
        if not category:
            sys.stderr.write(u"Error: unable to find stubs without a seed category\n");
            return [];

        if self.verbose:
            sys.stderr.write("Info: Trying to find {n} stub tasks...\n".format(n=nPages));

        foundPages = [];
        dbCursor = self.dbConn.cursor();
        exitLoop = False;
        while len(foundPages) < nPages and not exitLoop:
            randStubCategory = None;
            attempts = 0;
            while attempts < self.maxDBQueryAttempts:
                try:
                    # pick one random stub category (ns = 14)
                    dbCursor.execute(self.randomPageQuery,
                                     (re.sub(" ", "_", category).encode('utf-8'),
                                      14, 1));
                    for (pageId, pageTitle) in dbCursor:
                        randStubCategory = unicode(pageTitle, 'utf-8', errors='strict');
                except oursql.Error, e:
                    attempts += 1;
                    sys.stderr.write("Error: Unable to execute query to get a random stub category, possibly retrying!\n");
                    sys.stderr.write("Error {0}: {1}\n".format(e.args[0], e.args[1]));
                    if e.errno == oursql.errnos['CR_SERVER_GONE_ERROR'] \
                       or e.errno == oursql.errnos['CR_SERVER_LOST']:
                        # lost connection, reconnect
                        self.connectDatabase();
                    else:
                        break;
                else:
                    # query succeeded, no need to retry
                    break;

            if not randStubCategory:
                # something went wrong
                sys.stderr.write("Error: Unable to find random stub category, aborting!\n");
                exitLoop = True;
                continue;

            foundPages.extend(self.findPages(category=randStubCategory,
                                             nPages=nPages));

        # truncate to nPages
        if len(foundPages) > nPages:
            foundPages = foundPages[:nPages];

        if self.verbose:
            sys.stderr.write("Info: Found {n} stub tasks\n".format(n=len(foundPages)));

        return foundPages;
    def findAllPages(self, category=None):
        """
        Use the database to fetch all main namespace pages from a given category.
        Expects a working database connection to exist as self.dbConn.

        @param category: Name of the category to grab pages from
        @type category: unicode
        """
        if not category:
            sys.stderr.write(u"Error: unable to find pages from a given category without a category name!\n");
            return None;

        if self.verbose:
            sys.stderr.write(u"Info: finding all pages in category {cat}\n".format(cat=category).encode('utf-8'));

        attempts = 0;
        while attempts < self.maxDBQueryAttempts:
            try:
                foundPages = [];
                dbCursor = self.dbConn.cursor();
                dbCursor.execute(self.getAllPagesQuery,
                                 (re.sub(' ', '_', category).encode('utf-8'),  # catname
                                  0)  # ns
                                 );
                for (pageId, pageTitle) in dbCursor:
                    foundPages.append(unicode(re.sub('_', ' ', pageTitle),
                                              'utf-8', errors='strict'));
            except oursql.Error, e:
                attempts += 1;
                sys.stderr.write("Error: Unable to execute query to get pages from this category, possibly retrying!\n");
                sys.stderr.write("Error {0}: {1}\n".format(e.args[0], e.args[1]));
                if e.errno == oursql.errnos['CR_SERVER_GONE_ERROR'] \
                   or e.errno == oursql.errnos['CR_SERVER_LOST']:
                    # lost connection, reconnect
                    self.connectDatabase();
                else:
                    break;
            else:
                # query succeeded, no need to retry
                break;

        if attempts >= self.maxDBQueryAttempts:
            sys.stderr.write(u"Error: Exhausted number of query attempts!\n");
        elif self.verbose:
            sys.stderr.write(u"Info: found {n} pages in this category.\n".format(n=len(foundPages)).encode('utf-8'));

        return foundPages;
    def findSubcategoryPages(self, category=None):
        """
        Use the database to retrieve all direct descendant sub-categories
        of the given category.  Then find all pages in all the sub-categories
        and return the union of all of them.

        @param category: Name of the category to grab sub-category pages from
        @type category: unicode
        """
        if not category:
            sys.stderr.write(u"Error: unable to find sub-categories in a given category without a category name!\n");
            return None;

        if self.verbose:
            sys.stderr.write(u"Info: finding all pages from direct descendants of category {cat}\n".format(cat=category).encode('utf-8'));

        subCategories = [];
        attempts = 0;
        while attempts < self.maxDBQueryAttempts:
            try:
                dbCursor = self.dbConn.cursor();
                dbCursor.execute(self.getAllPagesQuery,
                                 (re.sub(' ', '_', category).encode('utf-8'),  # catname
                                  14)  # ns (14=Category)
                                 );
                for (pageId, pageTitle) in dbCursor:
                    subCategories.append(unicode(re.sub('_', ' ', pageTitle),
                                                 'utf-8', errors='strict'));
            except oursql.Error, e:
                attempts += 1;
                sys.stderr.write("Error: Unable to execute query to get sub-categories from this category, possibly retrying!\n");
                sys.stderr.write("Error {0}: {1}\n".format(e.args[0], e.args[1]));
                if e.errno == oursql.errnos['CR_SERVER_GONE_ERROR'] \
                   or e.errno == oursql.errnos['CR_SERVER_LOST']:
                    # lost connection, reconnect
                    self.connectDatabase();
                else:
                    break;
            else:
                # query succeeded, no need to retry
                break;

        if attempts >= self.maxDBQueryAttempts:
            sys.stderr.write(u"Error: Exhausted number of query attempts!\n");
            return [];
        elif self.verbose:
            sys.stderr.write(u"Info: found {n} sub-categories in this category.\n".format(n=len(subCategories)).encode('utf-8'));

        foundPages = set();
        for categoryName in subCategories:
            subCatPages = self.findAllPages(category=categoryName);
            if subCatPages:
                foundPages = foundPages.union(subCatPages);
        return foundPages;
    def findRandomPages(self, category=None, nPages=5):
        """
        Use the database to pick a number of pages from a given category.
        Expects a working database connection to exist as self.dbConn.

        @param category: Name of the category to grab pages from
        @type category: unicode

        @param nPages: number of pages to fetch
        @type nPages: int
        """
        if not category:
            sys.stderr.write(u"Error: unable to find pages without a category to pick from\n");
            return [];

        if self.verbose:
            sys.stderr.write(u"Info: finding {n} tasks from category {cat}\n".format(n=nPages, cat=category).encode('utf-8'));

        foundPages = [];
        attempts = 0;
        while attempts < self.maxDBQueryAttempts:
            try:
                dbCursor = self.dbConn.cursor();
                dbCursor.execute(self.randomPageQuery,
                                 (re.sub(' ', '_', category).encode('utf-8'),  # catname
                                  0,  # ns
                                  nPages)  # n pages
                                 );
                for (pageId, pageTitle) in dbCursor:
                    foundPages.append(unicode(re.sub('_', ' ', pageTitle),
                                              'utf-8', errors='strict'));
            except oursql.Error, e:
                attempts += 1;
                sys.stderr.write("Error: Unable to execute query to get pages from this category, possibly retrying!\n");
                sys.stderr.write("Error {0}: {1}\n".format(e.args[0], e.args[1]));
                if e.errno == oursql.errnos['CR_SERVER_GONE_ERROR'] \
                   or e.errno == oursql.errnos['CR_SERVER_LOST']:
                    # lost connection, reconnect
                    self.connectDatabase();
                else:
                    break;
            else:
                # query succeeded, no need to retry
                break;

        if attempts >= self.maxDBQueryAttempts:
            sys.stderr.write(u"Error: Exhausted number of query attempts!\n");
        elif self.verbose:
            sys.stderr.write(u"Info: found {n} tasks from this category.\n".format(n=len(foundPages)).encode('utf-8'));

        return foundPages;
    def findPages(self, category=None, nPages=5):
        """
        Pick a number of pages from a given category definition through
        sub-methods that access the database.

        @param category: Category definition of where to grab pages from
        @type category: unicode

        @param nPages: number of pages to fetch
        @type nPages: int
        """
        if not category:
            sys.stderr.write(u"Error: unable to find pages without a category definition to pick from\n");
            return [];

        if isinstance(category, unicode):
            return self.findRandomPages(category=category,
                                        nPages=nPages);
        else:
            # Create a set of all pages we find,
            # from which we'll randomly sample.
            foundPages = set();
            if isinstance(category, list):
                for catName in category:
                    if isinstance(catName, unicode):
                        foundPages = foundPages.union(self.findAllPages(category=catName));
                    elif isinstance(catName, tuple):
                        # Category name is the second element
                        foundPages = foundPages.union(self.findSubcategoryPages(category=catName[1]));
            elif isinstance(category, tuple):
                # Category name is the second element
                foundPages = self.findSubcategoryPages(category=category[1]);

            try:
                # OK, return a random sample of size nPages:
                return random.sample(foundPages, nPages);
            except ValueError:
                # Might happen if we have too few pages to sample;
                # return the whole set.
                return foundPages;
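
    # The three category-definition shapes findPages() accepts, shown with
    # entries from the default task definition above ("updater" is a
    # hypothetical OpenTaskUpdater instance):
    #
    #   updater.findPages(category=u"All articles needing copy edit");  # single category
    #   updater.findPages(category=[u"All dead-end pages",
    #                               u"All articles with too few wikilinks"]);  # list of categories
    #   updater.findPages(category=("use-subs", u"Wikipedia introduction cleanup"));  # sub-categories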
    def update(self):
        """
        Update the list of open tasks.
        """
        # Query used to log data in the database upon successful update
        # of the opentask page
        logEntryQuery = u"""INSERT INTO {tablename}
                            (page_selected, page_title, page_len, task_category,
                             assessed_class, predicted_class, quality,
                             popcount, popularity, strategy)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""".format(tablename=self.logTableName);

        # connect to the wiki and log in
        if self.verbose:
            sys.stderr.write("Info: connecting to {lang}wiki\n".format(lang=self.lang));
        wikiSite = pywikibot.getSite(self.lang);
        wikiSite.login();

        # Did we log in?
        if wikiSite.username() is None:
            sys.stderr.write("Error: failed to log in correctly, aborting!\n");
            return False;

        # connect to the database
        if self.verbose:
            sys.stderr.write("Info: Connecting to database\n");
        if not self.connectDatabase():
            sys.stderr.write("Error: failed to connect to database, aborting!\n");
            return False;

        # Are we oversampling?
        numPages = self.numPages;
        if self.samplingFactor > 1:
            numPages *= self.samplingFactor;

        # Let's deal with stubs first, where we'll pick random stub categories
        # until we have enough (self.numPages) pages from those
        if self.verbose:
            sys.stderr.write("Info: Finding stub tasks...\n");
        self.foundTasks['stub'] = self.findStubs(category=self.taskDef['stub'],
                                                 nPages=numPages);
        if self.verbose:
            sys.stderr.write("Info: Done finding stub tasks\n");

        # Handle relisted AfDs, they use a slightly different query
        if "afdrelist" in self.taskDef:
            if self.verbose:
                sys.stderr.write("Info: fetching relisted articles for deletion...\n");
            self.foundTasks['afdrelist'] = self.findAfDs(afdDef=self.taskDef['afdrelist'],
                                                         nPages=numPages);
            if self.verbose:
                sys.stderr.write(u"Info: done fetching relisted AfDs\n");

        # Now, for all the other categories...
        for (taskId, taskCategory) in self.taskDef.iteritems():
            if taskId == 'stub' \
               or taskId == 'afdrelist':
                # already done...
                continue;
            if self.verbose:
                sys.stderr.write(u"Info: finding tasks for id {id} from category {cat}\n".format(id=taskId, cat=taskCategory).encode('utf-8'));
            self.foundTasks[taskId] = self.findPages(category=taskCategory,
                                                     nPages=numPages);
            if self.verbose:
                sys.stderr.write("Info: Find complete, found {n} pages in this category\n".format(n=len(self.foundTasks[taskId])));
        # The data that we want to log about the selected pages,
        # populated as we go through them.
        logEntries = [];

        # Go through the found tasks and turn the list of page titles
        # into a unicode string; we write an unordered list (*)
        # where each list item is a link to a given page
        for (taskId, pageList) in self.foundTasks.iteritems():
            if not pageList:
                self.foundTasks[taskId] = u"None";
                # Add one log entry for this category with no pages.
                logEntries.append({'taskcategory': taskId,
                                   'title': None,
                                   'length': None,
                                   'strategy': None,
                                   'popcount': None,
                                   'popularity': None,
                                   'assessedclass': None,
                                   'quality': None,
                                   'predclass': None});
            else:
                if taskId == "afdrelist":
                    # Turn the SQL LIKE-pattern into a regex we can use
                    # to strip that prefix from the page title
                    stripPattern = u"";
                    pattern = self.taskDef['afdrelist']['pattern'];
                    if pattern:  # more than ""?
                        stripPattern = re.sub('%', "", pattern);
                        stripPattern = re.sub("_", " ", stripPattern);

                    # If we oversampled, reduce through pop/qual
                    if self.samplingFactor > 1:
                        pageData = self.selectSubset(pageList=pageList,
                                                     nPages=self.numPages,
                                                     replacePattern=stripPattern);
                        for (title, metadata) in pageData.iteritems():
                            logData = {'taskcategory': taskId,
                                       'title': title,
                                       'length': metadata['pagedata']['length'],
                                       'strategy': metadata['strategy'],
                                       'popcount': metadata['pagedata']['popcount'],
                                       'popularity': metadata['pagedata']['popularity'],
                                       'assessedclass': metadata['pagedata']['quality'],
                                       'quality': metadata['pagedata']['prediction'],
                                       'predclass': metadata['pagedata']['predclass']};
                            logEntries.append(logData);
                        # Recreate the page list
                        pageList = pageData.keys();
                    else:
                        # Add log entries with the right title
                        # and None-values for the metadata
                        for pageTitle in pageList:
                            logEntries.append({'taskcategory': taskId,
                                               'title': pageTitle,
                                               'length': None,
                                               'strategy': None,
                                               'popcount': None,
                                               'popularity': None,
                                               'assessedclass': None,
                                               'quality': None,
                                               'predclass': None});

                    # Build all the links manually
                    self.foundTasks[taskId] = u"\n".join([u"* [[{prefix}{fulltitle}|{linktitle}]]".format(prefix=self.taskDef['afdrelist']['prefix'], fulltitle=page, linktitle=re.sub(stripPattern, u"", page)) for page in pageList]);
                else:
                    # If we oversampled, reduce through pop/qual
                    if self.samplingFactor > 1:
                        pageData = self.selectSubset(pageList=pageList,
                                                     nPages=self.numPages);
                        for (title, metadata) in pageData.iteritems():
                            logData = {'taskcategory': taskId,
                                       'title': title,
                                       'length': metadata['pagedata']['length'],
                                       'strategy': metadata['strategy'],
                                       'popcount': metadata['pagedata']['popcount'],
                                       'popularity': metadata['pagedata']['popularity'],
                                       'assessedclass': metadata['pagedata']['quality'],
                                       'quality': metadata['pagedata']['prediction'],
                                       'predclass': metadata['pagedata']['predclass']};
                            logEntries.append(logData);
                        # Recreate the page list
                        pageList = pageData.keys();
                    else:
                        # Add log entries with the right title
                        # and None-values for the metadata
                        for pageTitle in pageList:
                            logEntries.append({'taskcategory': taskId,
                                               'title': pageTitle,
                                               'length': None,
                                               'strategy': None,
                                               'popcount': None,
                                               'popularity': None,
                                               'assessedclass': None,
                                               'quality': None,
                                               'predclass': None});

                    self.foundTasks[taskId] = u"\n".join([u"* {title}".format(title=pywikibot.Page(wikiSite, page).title(asLink=True)) for page in pageList]);
        if self.verbose:
            sys.stderr.write(u"Info: Turned page titles into page links, getting wikitext of page {taskpage}\n".format(taskpage=self.taskPage).encode('utf-8'));

        tasktext = None;
        try:
            taskpage = pywikibot.Page(wikiSite, self.taskPage);
            tasktext = taskpage.get();
        except pywikibot.exceptions.NoPage:
            sys.stderr.write(u"Warning: Task page {title} does not exist, aborting!\n".format(title=self.taskPage).encode('utf-8'));
        except pywikibot.exceptions.IsRedirectPage:
            sys.stderr.write(u"Warning: Task page {title} is a redirect, aborting!\n".format(title=self.taskPage).encode('utf-8'));
        except pywikibot.data.api.TimeoutError:
            sys.stderr.write(u"Error: API request to {lang}-WP timed out, unable to get wikitext of {title}, cannot continue!\n".format(lang=self.lang, title=self.taskPage));

        if tasktext is None:
            return False;

        if self.verbose:
            sys.stderr.write(u"Info: got wikitext, substituting page lists...\n");

        for (taskId, pageList) in self.foundTasks.iteritems():
            # note: using re.DOTALL because we need .*? to match \n,
            # since our content is a list
            tasktext = re.sub(ur'<span id="{taskid}">(.*?)</span>'.format(taskid=taskId),
                              ur'<span id="{taskid}">\n{pagelist}</span>'.format(taskid=taskId, pagelist=pageList),
                              tasktext, flags=re.DOTALL);
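
        # To illustrate the substitution above (hypothetical wikitext):
        # a task page containing
        #   <span id="copyedit">* [[Old page]]</span>
        # would, with a two-item page list, become
        #   <span id="copyedit">
        #   * [[Some page]]
        #   * [[Another page]]</span>
        # i.e. everything between the span tags is replaced with the new list.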
        if self.testRun:
            sys.stderr.write(u"Info: Running a test, printing out new wikitext:\n\n");
            print tasktext.encode('utf-8');
        else:
            if self.verbose:
                sys.stderr.write(u"Info: Saving page with new text\n");
            taskpage.text = tasktext;
            try:
                taskpage.save(comment=self.editComment);
            except pywikibot.exceptions.EditConflict:
                sys.stderr.write(u"Error: Saving page {title} failed, edit conflict.\n".format(title=self.taskPage).encode('utf-8'));
                return False;
            except pywikibot.exceptions.PageNotSaved as e:
                sys.stderr.write(u"Error: Saving page {title} failed.\nError: {etext}\n".format(title=self.taskPage, etext=e).encode('utf-8'));
                return False;
            except pywikibot.data.api.TimeoutError:
                sys.stderr.write(u"Error: Saving page {title} failed, API request timeout fatal\n".format(title=self.taskPage).encode('utf-8'));
                return False;
            else:
                # Everything went OK, switch connection to the SQL server
                # used for logging.
                if not self.connectDatabase(hostName=self.logDBHost,
                                            dbName=self.logDBName):
                    sys.stderr.write(u"Error: Unable to connect to DB server {server} using DB {database} for logging\n".format(server=self.logDBHost, database=self.logDBName));
                else:
                    timestamp = pywikibot.Timestamp.fromISOformat(taskpage.editTime());
                    with self.dbConn.cursor() as dbCursor:
                        for logData in logEntries:
                            try:
                                if logData['title']:
                                    logData['title'] = logData['title'].encode('utf-8');
                                dbCursor.execute(logEntryQuery,
                                                 (timestamp,
                                                  logData['title'],
                                                  logData['length'],
                                                  logData['taskcategory'],
                                                  logData['assessedclass'],
                                                  logData['predclass'],
                                                  logData['quality'],
                                                  logData['popcount'],
                                                  logData['popularity'],
                                                  logData['strategy']));
                            # NOTE: Consider catching something other than
                            # oursql.Error, so that warnings are caught as well.
                            except oursql.Error, e:
                                sys.stderr.write("Error: Unable to insert log entry!\n");
                                sys.stderr.write("Error {0}: {1}\n".format(e.args[0], e.args[1]));

        # OK, all done
        if self.verbose:
            sys.stderr.write("Info: List of open tasks successfully updated!\n");

        if not self.disconnectDatabase():
            sys.stderr.write(u"Warning: Unable to cleanly disconnect from the database!\n");

        return True;
    def selectSubset(self, pageList=[], nPages=5, replacePattern=None):
        """
        Expects a list of pages of length greater than self.numPages, from
        which we will pick self.numPages pages based on some criteria.

        @param pageList: page titles we'll want to select a subset from
        @type pageList: list

        @param nPages: number of pages we want to end up with
        @type nPages: int

        @param replacePattern: regular expression pattern for replacement,
                               used to strip page titles so we correctly
                               inspect an associated page (e.g. for AfDs)
        @type replacePattern: unicode
        """
        pageMapping = {};
        popQualData = [];
        # number of picks from each non-random selection strategy
        nNonRandom = nPages/5;
        sortedPages = {};

        # The Pop/Qual server takes care of gathering popularity and quality
        # data as efficiently as possible for our list of pages.
        if replacePattern:
            # Map replaced page titles to original titles.
            for pageTitle in pageList:
                replacedTitle = re.sub(replacePattern, u"", pageTitle);
                pageMapping[replacedTitle] = pageTitle;
            popQualData = self.popQualServer.getPopQualList(pageMapping.keys());
        else:
            popQualData = self.popQualServer.getPopQualList(pageList);

        # Now, how to actually select the pages?
        # 0: keep pages for which we have data
        popQualData = [pageData for pageData in popQualData
                       if pageData['status'] == 200
                       and pageData['pred-numeric'] > 0];
        # 1: sort by popularity, high to low
        sortedPages['highpop'] = sorted(popQualData,
                                        key=lambda pageData: pageData['popcount'],
                                        reverse=True);
        # 2: sort by quality, high to low
        sortedPages['highqual'] = sorted(popQualData,
                                         key=lambda pageData: pageData['pred-numeric'],
                                         reverse=True);
        # 3: sort by quality, low to high
        # (most likely correlated with popularity, so we don't need both)
        sortedPages['lowqual'] = sorted(popQualData,
                                        key=lambda pageData: pageData['pred-numeric']);
        # 4: sort by discrepancy between popularity and quality
        # (since we already sorted by high popularity, we just sort that
        # again by quality, low to high)
        sortedPages['maxlove'] = sorted(sortedPages['highpop'],
                                        key=lambda pageData: pageData['pred-numeric']);

        # randomise the order of the strategies
        strategies = sortedPages.keys();
        random.shuffle(strategies);

        selectedPages = {};
        # Pick pages from the non-random strategies
        for strategy in strategies:
            i = 0;
            nSelected = 0;
            while nSelected < nNonRandom and i < len(sortedPages[strategy]):
                pageTitle = sortedPages[strategy][i]['title'];
                if pageTitle not in selectedPages:
                    selectedPages[pageTitle] = {'strategy': strategy,
                                                'pagedata': sortedPages[strategy][i]};
                    nSelected += 1;
                i += 1;

        # Fill up the rest with randomly picked pages
        while len(selectedPages) < nPages:
            randPage = random.choice(popQualData);
            if randPage['title'] not in selectedPages:
                selectedPages[randPage['title']] = {'strategy': 'random',
                                                    'pagedata': randPage};

        # if we replaced titles, reverse that after selection
        if replacePattern:
            replacedTitles = {};
            for (pageTitle, pageData) in selectedPages.iteritems():
                mappedTitle = pageMapping[pageTitle];
                replacedTitles[mappedTitle] = pageData;
            selectedPages = replacedTitles;

        return selectedPages;
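
    # Worked example of the mix above: with nPages=5, nNonRandom = 5/5 = 1,
    # so each of the four strategies (highpop, highqual, lowqual, maxlove)
    # contributes one page and the fifth is picked at random; with nPages=10,
    # nNonRandom = 2, giving eight strategy picks and two random ones.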
def main():
    import argparse;

    cli_parser = argparse.ArgumentParser(
        description="Program to update the list of open tasks for a given Wikipedia."
        );

    # Option to control the edit comment
    cli_parser.add_argument('-c', '--comment', default=None,
                            help="edit comment to use when saving the new page");
    # Option to control the list of tasks
    cli_parser.add_argument('-d', '--taskdef', default=None,
                            help="repr of a dictionary mapping task IDs to task categories");
    # Option to control where the classifier configuration file is located
    cli_parser.add_argument('-f', '--classifier', default=None, metavar="<classifier-path>",
                            help="path to file with hostname and port of the quality classifier");
    # Option to control language
    cli_parser.add_argument('-l', '--lang', default=u'en',
                            help="language code of the Wikipedia we're working on (default: en)");
    # Option to control the MySQL configuration file
    cli_parser.add_argument('-m', '--mysqlconf', default=None,
                            help="path to MySQL configuration file");
    # Option to control the number of pages per category of tasks
    cli_parser.add_argument('-n', '--numpages', default=5, type=int,
                            help="number of pages displayed in each task category (default: 5)");
    # Option to control the number of oversampled pages
    # when selecting based on popularity and quality
    cli_parser.add_argument('-o', '--oversample', default=20, type=int,
                            help="multiplication factor used for oversampling and selection by popularity and quality (a value of '1' turns it off)");
    # Option to control where the list of open tasks is
    cli_parser.add_argument('-p', '--page', default=None,
                            help="title of the page with the open tasks");
    # Test option
    cli_parser.add_argument('-t', '--test', action='store_true',
                            help='if set, the program does not save the page, but writes the final wikitext to stdout instead');
    # Verbosity option
    cli_parser.add_argument('-v', '--verbose', action='store_true',
                            help='if set, informational output is written to stderr');

    args = cli_parser.parse_args();

    if args.taskdef:
        args.taskdef = eval(args.taskdef);

    taskUpdater = OpenTaskUpdater(verbose=args.verbose, lang=args.lang,
                                  mysqlConf=args.mysqlconf, taskPage=args.page,
                                  taskDef=args.taskdef, pagesPerCategory=args.numpages,
                                  editComment=args.comment, testRun=args.test,
                                  samplingFactor=args.oversample,
                                  classifierFile=args.classifier);
    try:
        taskUpdater.update();
    finally:
        taskUpdater.stopme();

if __name__ == "__main__":
    main();
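
# Example invocation (hypothetical values): a verbose test run against the
# English Wikipedia that prints the new wikitext instead of saving the page:
#
#   python opentasks.py --lang en --numpages 6 --test --verbose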