PageRenderTime 52ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/dump/src/main/mediawiki/DBpediaFunctions.php

https://github.com/SunghanKim/extraction-framework
PHP | 514 lines | 455 code | 20 blank | 39 comment | 6 complexity | 4634186f922c15d2e01b1d941a3975ea MD5 | raw file
  1. <?php
  /**
   * Helper functions to clean up wiki source text for dbpedia.
   *
   * Two groups of helpers live here:
   *  - cleanOutput() / cleanHtml() post-process rendered HTML and reduce it to
   *    plain text (links, references, pronunciation hints etc. are removed);
   *  - getAbstract() reduces raw wiki source to its introductory section and
   *    strips templates, tables, references and namespace links from it.
   */
  class DBpediaFunctions {

      /*
      To make sure that Apache sends debug output to the browser,
      add the following to index.php:

      @apache_setenv('no-gzip', 1);
      @ini_set('zlib.output_compression', 0);
      @ini_set('implicit_flush', 1);
      for ($i = 0; $i < ob_get_level(); $i++) { ob_end_flush(); }
      ob_implicit_flush(1);
      */

      /**
       * Should Wiki and HTML code be cleaned? If false, generate HTML for the
       * introduction as usual. May be useful for debugging.
       */
      const CLEAN = true;

      /**
       * Only the first three sections are inspected when looking for the
       * introduction.
       */
      const MAX_SECTIONS = 3;

      /**
       * Placeholders that temporarily stand in for the table delimiters "{|"
       * and "|}" while cleanTemplates() runs, so that the brace handling in
       * there does not touch table markup.
       */
      const TABLE_OPEN_PLACEHOLDER = "12345654321";
      const TABLE_CLOSE_PLACEHOLDER = "98765456789";

      /**
       * Callback for preg_replace_callback(): drops a ", IPA:...," fragment when
       * the captured text contains at most three spaces.
       *
       * @param $matches regex matches; [0] is the whole fragment, [1] the text after "IPA:"
       * @return the empty string for short fragments, otherwise the unchanged fragment
       */
      private static function removeShortIPA( $matches ) {
          return substr_count($matches[1], " ") <= 3 ? "" : $matches[0];
      }

      /**
       * Callback for preg_replace_callback(): drops a parenthesised fragment if
       * it consists mostly of links.
       *
       * The number of "<a " occurrences is divided by the number of
       * space-separated pieces that remain once the links and all other tags
       * are removed; a ratio of 0.25 or more deletes the fragment.
       *
       * @param $matches regex matches; [0] is the parenthesised fragment
       * @return the empty string or the unchanged fragment
       */
      private static function replaceTooManyLinks( $matches ) {
          $match = $matches[0];
          $countLinks = substr_count($match, "<a ");
          if ($countLinks <= 0) return $match;

          $match_without_links = preg_replace("~<a(\s+[^>]*)?>.*?</a>~", "", $match);
          // explode() replaces the POSIX-regex split() that was removed in PHP 7.
          // It always returns at least one element, so the division is safe.
          $matchWords = explode(" ", strip_tags($match_without_links));
          return $countLinks / count($matchWords) >= 0.25 ? "" : $match;
      }

      /**
       * Get HTML from $wgOut->getHTML(), call self::cleanHtml(), set HTML
       * back into $wgOut. Does nothing when self::CLEAN is false.
       */
      static function cleanOutput() {
          global $wgOut;

          if (! self::CLEAN) return;

          // wfProfileIn( __METHOD__ );
          $html = self::cleanHtml($wgOut->getHTML());
          $wgOut->clearHTML();
          $wgOut->addHTML($html);
          // wfProfileOut( __METHOD__ );
      }

      /**
       * Reduce rendered HTML to cleaned-up plain text.
       *
       * The individual steps depend on each other, so their order matters.
       *
       * Use this line for debugging:
       * echo __LINE__, "<p/>\n", $html, "<hr/>\n";
       *
       * @param $html the html
       * @return the cleaned text, with all tags stripped
       */
      static function cleanHtml( $html ) {
          $html = preg_replace("~<br ?/?>~", "", $html);

          // remove map references
          // example: http://en.wikipedia.org/w/index.php?title=Attila_the_Hun&oldid=300682051
          $html = preg_replace("~\s?\(see map below\)~", "", $html);

          // remove loudspeaker icon links, e.g.
          // <a href="/wikipedia/index.php/Image:Ltspkr.png" title="Image:Ltspkr.png">Image:Ltspkr.png</a>
          $html = preg_replace("~<a\s[^>]*Image:Ltspkr\.png[^>]*>[^<]*</a>~", "", $html);
          $html = preg_replace("~<a\s[^>]*File:Loudspeaker\.svg[^>]*>[^<]*</a>~", "", $html);

          // remove leftover [[...]]
          // TODO: Log warning. Links should have been rendered / removed.
          $html = preg_replace("/\[\[[^\]]*?\]\]/", "", $html);

          // remove nested round brackets: (x(y)z -> (xz
          // repeated until nothing changes, to unwrap deeper nesting
          do {
              $in = $html;
              $html = preg_replace("~(\([^)(]*)\([^)(]*\)~", "$1", $html);
          } while ($html != $in);

          // remove square brackets nested in round brackets: (x[y]z -> (xz
          do {
              $in = $html;
              // TODO: use "~(\([^)[]*)\[[^)]]*\]~" instead
              $html = preg_replace("~(\([^)]*)\[[^]]*\]~", "$1", $html);
          } while ($html != $in);

          // delete parentheses with too many links
          $html = preg_replace_callback("/\s?\(.*?\)/", 'DBpediaFunctions::replaceTooManyLinks', $html);

          // remove auto-numbered external links, e.g.
          // <a href="http://..." class="external autonumber" title="http://..." rel="nofollow">[1]</a>
          $html = preg_replace("~<a[^>]*class=\"external autonumber[^>]*>.*?</a>~", "", $html);

          $html = self::replaceAudioSpans($html);

          // remove footnote markers and small print, e.g.
          // <sup id="cite_ref-0" class="reference"><a href="#cite_note-0" title="">[1]</a></sup>
          $html = preg_replace("~<sup(\s+[^>]*)?>.*?</sup>~", "", $html);
          $html = preg_replace("~<small(\s+[^>]*)?>.*?</small>~", "", $html);

          // remove IPA spans following an opening bracket, e.g.
          // (pronounced <span title="Pronunciation in the International Phonetic Alphabet (IPA)" class="IPA">
          $html = preg_replace("~\([^)]*<span [^>]*class=\"[^\"]*IPA[^\"]*\"[^>]*>.*?</span>~", "(", $html);
          // (<a href="/wikipedia/index.php/Help:Pronunciation" class="mw-redirect" title="Help:Pronunciation">pronounced</a> <span class="IP...
          $html = preg_replace("~\(<a href[^>]*>.*?</a> <span [^>]*class=\"[^\"]*IPA[^\"]*\"[^>]*>.*?</span>~", "(", $html);

          // drop links whose text is completely wrapped in square brackets, e.g.
          // <a href="/wikipedia/index.php?title=Special:Upload&amp;wpDestFile=En-us-Denmark.ogg" class="new" title="En-us-Denmark.ogg">[...]</a>
          $html = preg_replace("~<a .*?>\[(.*?)\]</a>~", "", $html);
          // replace all remaining links by their link text
          $html = preg_replace("~<a .*?>\[?(.*?)\]?</a>~", "$1", $html);

          // remove <strong class="error">...</strong>
          $html = preg_replace("~<strong class=\"error\">.*?</strong>~", "", $html);

          $html = strip_tags($html);

          $html = str_replace("&nbsp;", " ", $html);
          $html = str_replace("&#32;", " ", $html);
          $html = str_replace("&#x20;", " ", $html);

          // NOTE: inside a double-quoted PHP string "\/" stays a backslash followed
          // by a slash, so this removes the literal four characters \/\/ and not "//".
          // TODO: why is it necessary to remove '//'? And why escape '/'?
          $html = str_replace("\/\/", "", $html);

          $html = self::removeNonsense($html);

          // remove " ' '" before ")" TODO: why?
          $html = preg_replace("/[\s]*'[\s]*'\)/", ")", $html);
          // remove "' ' " after "(" TODO: why?
          $html = preg_replace("/\('[\s]*'[\s]*/", "(", $html);

          // remove "xyz:,;..." after "("
          // example: (Sinhalese:, ... )
          $html = preg_replace("/\([^\s\)]*?:[,;]+/", "(", $html);

          // "(xyz: ')" and "(xyz: )", e.g. (Arabic: '), need no special treatment:
          // brackets containing colons are removed further below.

          $html = self::removeNonsense($html);

          // remove "(... ')"
          $html = preg_replace("/\([^\s\)]* '\)/", "", $html);
          // remove "(' ...)"
          $html = preg_replace("/\(' [^\s\)]*\)/", "", $html);

          $html = self::removeNonsense($html);

          $html = str_replace("\n", " ", $html);
          $html = trim($html);

          // AUDIO stuff: ", IPA:...," fragments
          $html = preg_replace_callback("/,\s*IPA:(.*?),/s", 'DBpediaFunctions::removeShortIPA', $html);

          // remove brackets containing colons.
          $html = preg_replace("/\([^\)]*?:[^\)]*?\)/s", "", $html);

          $html = self::removeNonsense($html);

          // if there's a closing bracket before an opening bracket, remove it.
          // Both positions are checked against false explicitly; the outcome is
          // the same as that of the former loose comparison of strpos() results.
          $closingPos = strpos($html, ")");
          $openingPos = strpos($html, "(");
          if ($closingPos !== false && $openingPos !== false && $closingPos < $openingPos) {
              $html = preg_replace('/\)/', '', $html, 1);
          }

          // make sure a full stop is followed by a space, except next to digits or dots
          // Todo: what about "3.4 % of"?
          $html = preg_replace("/\.([^0-9.][^0-9.])/", ". $1", $html);
          $html = preg_replace("/ +/", " ", $html);

          return $html;
      }

      /**
       * Tidy up whitespace and punctuation around brackets and drop empty
       * brackets. Called repeatedly by cleanHtml() after steps that may leave
       * such debris behind.
       *
       * @param $html the text
       * @return the tidied text
       */
      private static function removeNonsense($html) {
          // Remove spaces before commas and semicolons
          $html = preg_replace("/\s+,/", ",", $html);
          $html = preg_replace("/\s+;/", ";", $html);

          // TODO: when can this happen?
          // $html = preg_replace("/[;|,]..[;|,]/", ",", $html);

          // Remove spaces near brackets
          $html = preg_replace("~\(\s*~", "(", $html);
          $html = preg_replace("~\s*\)~", ")", $html);

          // Remove commas etc. near brackets
          $html = preg_replace("~\(([;,:]\s*)*~", "(", $html);
          $html = preg_replace("~(\s*[;,:])*\)~", ")", $html);

          // Remove empty brackets
          $html = preg_replace("/\[[\s]*\]/", "", $html);
          $html = preg_replace("/\([\s]*\)/", "", $html);

          return $html;
      }

      /**
       * Remove or unwrap the spans that the audio templates render.
       *
       * TODO: Log if the following matches. It's probably no longer needed.
       *
       * @param $html the html
       * @return the html without audio link markup
       */
      private static function replaceAudioSpans( $html ) {
          $audioSpan = "<span class=\"unicode audiolink\">";

          // Text starting with an audio span, e.g.
          // <span class="unicode audiolink"><a href="http://upload.wikimedia.org/wikipedia/commons/b/b4/Stavanger.ogg" class="internal" title="Stavanger.ogg"><b>Stavanger</b></a></span>
          if ((strpos($html, $audioSpan) === 0) || (strpos($html, "<b>" . $audioSpan) === 0)) {
              // keep the content of the spans, remove only the span tags
              $html = preg_replace("~<span class=\"unicode audiolink\">(.*?)</span>~s", "$1", $html);
          } else {
              // audio spans whose link text is bold are replaced by that text, e.g.
              // <span class="unicode audiolink"><a href="..." class="new" title="..."><b>Eugen Berthold Friedrich Brecht</b></a></span>
              $html = preg_replace("~<span class=\"unicode audiolink\"><a [^>]*><b>(.*?)</b></a></span>~s", "$1", $html);

              // all other audio spans are removed completely, e.g.
              // <span class="unicode audiolink"><a href="..." class="new" title="Anime.ogg"><i>listen</i></a></span>
              $html = preg_replace("~<span class=\"unicode audiolink\">.*?</span>~s", "", $html);

              // replace audio spans in parentheses, e.g.
              // (<span class="unicode" style="white-space: nowrap;"> <a href="..." class="new" title="...">pronunciation</a> </span>
              $spanInParens = "~\(([^)]*?)<span\sclass=\"unicode\"[^>]*>.*?</span>~s";
              while (preg_match($spanInParens, $html)) {
                  $html = preg_replace($spanInParens, "($1", $html);
              }
          }

          // TODO: this shouldn't be replaced when it is at the beginning of the text
          // <span class="metadata audiolinkinfo"><small>(<a href="/wikipedia/index.php/Wikipedia:Media_help" title="Wikipedia:Media help">help</a>...
          $html = preg_replace("~<span class=\"metadata audiolinkinfo\">.*?</span>~s", "", $html);
          $html = preg_replace("~<span class=\"audiolinkinfo\">.*?</span>~s", "", $html);
          $html = preg_replace("~<a href=\"\/wikipedia\/index\.php\/Datei:Loudspeaker\.svg\" title=\"Datei:Loudspeaker.svg\">Datei:Loudspeaker\.svg<\/a>~", "", $html);

          // echo $html."<hr>";
          return $html;
      }

      /**
       * Use this line for debugging:
       * echo __LINE__, "<p/>\n", $wiki, "<hr/>\n";
       *
       * @param $wiki the wiki source
       * @return a clean and short version of the given wiki source
       */
      static function getAbstract( $wiki ) {
          // wfProfileIn( __METHOD__ );
          $section = self::getFirstSection($wiki);

          if (self::CLEAN) $section = self::cleanWikiText($section);

          // The first occurrence of a bold word probably distinguishes the introductory section.
          $section = self::getBoldSection($section);

          // wfProfileOut( __METHOD__ );
          return $section;
      }

      /**
       * Split wiki text into sections, separated by headings or horizontal
       * lines. At most MAX_SECTIONS + 1 non-empty pieces are returned; the last
       * piece holds the unsplit remainder.
       *
       * @param $wiki the wiki source
       * @return array of section texts; empty if there is no text or splitting failed
       */
      private static function splitSections( $wiki ) {
          $sections = preg_split("~(^=.*=\s*$|^----.*$)~m", $wiki, self::MAX_SECTIONS + 1, PREG_SPLIT_NO_EMPTY);
          return $sections === false ? array() : $sections;
      }

      /**
       * Fallback when no bold word was found: use the first section if it has
       * at least 20 bytes. TODO: improve the heuristic.
       *
       * @param $sections array as returned by splitSections()
       * @return the first section or the empty string
       */
      private static function firstSectionOrEmpty( $sections ) {
          // isset() guards against input without any section (e.g. empty text),
          // where reading index 0 would raise a warning.
          return (isset($sections[0]) && strlen($sections[0]) >= 20) ? $sections[0] : "";
      }

      /**
       * Split text into lines, swallowing spaces and tabs next to each line
       * break. PCRE equivalent of the former split("( |\t)*\n( |\t)*", ...);
       * split() no longer exists as of PHP 7.
       *
       * @param $wiki the wiki source
       * @return array of lines
       */
      private static function splitLines( $wiki ) {
          $lines = preg_split("/[ \t]*\n[ \t]*/", $wiki);
          return $lines === false ? array($wiki) : $lines;
      }

      /**
       * The first occurrence of a bold word probably distinguishes the introductory section.
       *
       * Returns the text from the start up to and including the first of the
       * leading sections that contains a bold marker (''') outside of an
       * unclosed template.
       *
       * @param $wiki the wiki source
       * @return the leading sections, or the first section / empty string as fallback
       */
      private static function getFirstSection( $wiki ) {
          // remove comments
          $wiki = preg_replace('~ ?<!--.*?-->~s', '', $wiki);

          $sections = self::splitSections($wiki);

          $first_sections = "";
          foreach ($sections as $si => $section) {
              // the piece at index MAX_SECTIONS is the unsplit remainder
              if ($si === self::MAX_SECTIONS) break;

              if (strpos($section, "'''") !== false) {
                  // Walk through the text between the bold markers. As soon as all
                  // templates opened so far are closed again, the marker found is
                  // not inside a template and this section is accepted.
                  $passed_parts = "";
                  foreach (explode("'''", $section) as $part) {
                      $passed_parts .= $part;
                      $text_so_far = $first_sections . $passed_parts;
                      if (substr_count($text_so_far, "{{") <= substr_count($text_so_far, "}}")) {
                          return $first_sections . $section;
                      }
                  }
              }

              $first_sections .= $section;
          }

          // No bold word? Just use the first section.
          return self::firstSectionOrEmpty($sections);
      }

      /**
       * The first occurrence of a bold word probably distinguishes the introductory section.
       *
       * Finds the first paragraph with a bold marker (''') in the leading
       * sections and drops the paragraphs of that section that precede it.
       *
       * @param $wiki the (cleaned) wiki source
       * @return the introduction, or the first section / empty string as fallback
       */
      private static function getBoldSection( $wiki ) {
          $sections = self::splitSections($wiki);

          foreach ($sections as $si => $section) {
              if ($si === self::MAX_SECTIONS) break;

              // split section into paragraphs: separated by empty lines
              $paras = preg_split("~\n\s*\n~", $section, -1, PREG_SPLIT_NO_EMPTY);
              if ($paras === false) continue;

              foreach ($paras as $pi => $para) {
                  // remove all paragraphs before the one containing the bold word
                  if (strpos($para, "'''") !== false) {
                      return $pi === 0 ? $section : implode(' ', array_slice($paras, $pi));
                  }
              }
          }

          // No bold word? Just use the first section.
          return self::firstSectionOrEmpty($sections);
      }

      /**
       * Strip templates, tables, <math>, <imagemap>, <gallery>, references and
       * namespace links from wiki source.
       *
       * Note: unbalanced template brackets may make the recursive regex in
       * cleanTemplates() fail; an earlier, disabled safeguard skipped such pages.
       *
       * @param $wiki the wiki source
       * @return the cleaned wiki source
       */
      private static function cleanWikiText( $wiki ) {
          // normalize line breaks
          $wiki = str_replace("\r\n", "\n", $wiki);
          $wiki = str_replace("\r", "\n", $wiki);

          // Hide table delimiters at line starts behind placeholders while the
          // templates are cleaned, then restore them.
          $wiki = preg_replace('~^[\s|:]*\{\|~m', self::TABLE_OPEN_PLACEHOLDER, $wiki);
          $wiki = preg_replace('~^[\s|:]*\|\}(?!\})~m', self::TABLE_CLOSE_PLACEHOLDER, $wiki);

          $wiki = self::cleanTemplates($wiki);

          $wiki = str_replace(self::TABLE_OPEN_PLACEHOLDER, "{|", $wiki);
          $wiki = str_replace(self::TABLE_CLOSE_PLACEHOLDER, "|}", $wiki);

          $wiki = self::removeTables($wiki);

          // clean <math> </math>
          $wiki = preg_replace("~<math(\s+[^>/]*)?>.*?</math>~s", "", $wiki);
          // clean <imagemap>
          $wiki = preg_replace("~<imagemap(\s+[^>/]*)?>.*?</imagemap>~s", "", $wiki);
          // clean <gallery>
          $wiki = preg_replace("~<gallery(\s+[^>/]*)?>.*?</gallery>~s", "", $wiki);

          // first clean references, then clean links (there might be special links with refs as parameters)
          $wiki = self::cleanRefs($wiki);
          $wiki = self::cleanLinks($wiki);

          return $wiki;
      }

      /**
       * Remove (possibly nested) wiki tables line by line.
       *
       * A line starting with "{|" opens a table, a line starting with "|}"
       * closes one. Lines inside tables are dropped. Text that follows the
       * "|}" closing the outermost table is kept. Every kept line is
       * terminated by "\n".
       *
       * @param $wiki the wiki source
       * @return the wiki source without tables
       */
      private static function removeTables( $wiki ) {
          $kept = "";
          $depth = 0;
          foreach (explode("\n", $wiki) as $line) {
              $tLine = ltrim($line);
              if (strncmp($tLine, "{|", 2) === 0) {
                  $depth += 1;
              } else if (($depth > 0) && (strncmp($tLine, "|}", 2) === 0)) {
                  $depth -= 1;
                  // Keep what follows the closing delimiter. (This used to read an
                  // undefined variable "$tline", so the text was lost.)
                  if ($depth == 0 && strlen($tLine) >= 3) $kept .= substr($tLine, 2) . "\n";
              } else if ($depth == 0) {
                  $kept .= $line . "\n";
              }
          }
          return $kept;
      }

      /**
       * Remove some templates: 'otheruses', 'TOC', 'Unreferenced',
       * and templates that contain line breaks or are in a separate line.
       * Lines that consist of templates only (no letter besides them) are
       * removed as well. Single curly braces are turned into round brackets.
       *
       * @param $wiki the wiki source
       * @return the given wiki source minus some templates
       */
      private static function cleanTemplates( $wiki ) {
          // Replace single curly braces by round brackets
          $wiki = preg_replace('/(?<!\{)\{(?!\{)/s', "(", $wiki);
          $wiki = preg_replace('/(?<!\})\}(?!\})/s', ")", $wiki);

          $wiki = preg_replace('/\{\{otheruses\}\}/xi', "", $wiki);
          $wiki = preg_replace('/\{\{TOC[a-zA-Z]*\}\}/xi', "", $wiki);
          $wiki = preg_replace('/\{\{Unreferenced[^\}]*\}\}/xi', "", $wiki);

          // HACK leave Bio Templates in italian Wikipedia
          $wiki = str_replace('{{Bio', 'BIO35363', $wiki);

          // Collect all top-level templates. (?>...) is an atomic group, which
          // keeps the regex from backtracking into the plain text; (?R) recurses
          // into the whole pattern, so nested templates are matched as well.
          $templates = array();
          preg_match_all('/\{\{((?>[^\{\}]+)|(?R))*\}\}/x', $wiki, $templates);
          // guard against a failed match, which may leave the result incomplete
          $found = isset($templates[0]) ? $templates[0] : array();

          // NOTE: the lines are deliberately not trimmed any further. The former
          // array_walk(..., create_function(...)) call never trimmed anything,
          // because the function body passed to it was evaluated to an empty
          // string; create_function() no longer exists as of PHP 8.
          $wiki_array = self::splitLines($wiki);

          foreach ($found as $tpl) {
              // If the template contains line breaks or is
              // in a separate line, remove it.
              if (strpos($tpl, "\n") !== false || in_array($tpl, $wiki_array, true)) {
                  $wiki = str_replace($tpl, "", $wiki);
                  // if template would be a separate line after deleting other templates, we wouldn't find it,
                  // so we split the wiki page again
                  // example: http://en.wikipedia.org/w/index.php?title=Vladimir_Lenin&oldid=300435749
                  // Todo: Time costs, new errors?
                  $wiki_array = self::splitLines($wiki);
              }
          }

          // delete lines with only templates on it, keep lines with at least one word character besides templates
          $wiki_array = self::splitLines($wiki);
          foreach ($wiki_array as $id => $line) {
              // str_replace() processes the templates one after the other
              $line_temp = trim(strip_tags(str_replace($found, "", $line)));

              // if we didn't change anything, don't delete the line.
              // TODO: we want to remove lines that contain stuff like '*' at the start and otherwise only templates,
              // but we may want to keep lines that contain templates but important wiki markup, like ']]'.
              if (($line_temp !== $line) && (! preg_match("~\pL~", $line_temp))) {
                  unset($wiki_array[$id]);
              }
          }

          $wiki = implode("\n", $wiki_array);

          // HACK leave Bio Templates in italian Wikipedia
          $wiki = str_replace('BIO35363', '{{Bio', $wiki);

          return $wiki;
      }

      /**
       * Remove special links (ones whose target starts with a namespace).
       *
       * @param $wiki the wiki source
       * @return the given wiki source minus some links
       */
      private static function cleanLinks( $wiki ) {
          return preg_replace("/\[\[[^\|\[\]\{\}\.]+:(?:[^\]]*?\[\[[^\]]*?\]\])*[^\[]*?\]\] */", "", $wiki);
      }

      /**
       * Remove self-closing <ref/> tags and <ref>...</ref> elements.
       *
       * @param $wiki the wiki source
       * @return the given wiki source minus references
       */
      private static function cleanRefs( $wiki ) {
          $wiki = preg_replace("~<ref(\s+[^>/]*)?/>~", "", $wiki);
          $wiki = preg_replace("~<ref(\s+[^>/]*)?>.*?</ref>~", "", $wiki);
          return $wiki;
      }

      /**
       * Debugging aid: echo the current call stack as HTML, one line per frame,
       * followed by a horizontal rule. Each line shows class and function of a
       * frame together with file (relative to this file's directory) and line
       * taken from the preceding frame.
       */
      static function print_html_stack() {
          $pathlen = strlen(dirname(__FILE__)) + 1;
          $file = '';
          $line = '';
          foreach (debug_backtrace(false) as $index => $frame) {
              if ($index > 0) {
                  // plain functions have no 'class' entry
                  $class = isset($frame['class']) ? $frame['class'] : '';
                  echo $class, '::', $frame['function'], ' (', $file, ':', $line, ')', "<br/>\n";
              }
              // frames of internal calls may lack 'file' and 'line'
              $file = isset($frame['file']) ? substr($frame['file'], $pathlen) : '';
              $line = isset($frame['line']) ? $frame['line'] : '';
          }
          echo "<hr/>\n";
      }
  }