PageRenderTime 65ms CodeModel.GetById 34ms RepoModel.GetById 0ms app.codeStats 0ms

/dump/src/main/mediawiki/DBpediaFunctions.php

https://gitlab.com/varunkothamachu/extraction-framework
PHP | 517 lines | 455 code | 20 blank | 42 comment | 6 complexity | 37786f59cab85383e3f6bd7c2ab8c927 MD5 | raw file
  1. <?php
  2. /**
  3. * **************************************
  4. * NO LONGER NEEDED read the instructions
  5. * **************************************
  6. * Helper functions to clean up wiki source text for dbpedia.
  7. */
  8. class DBpediaFunctions {
  9. /*
  10. To make sure that Apache sends debug output to the browser,
  11. add the following to index.php:
  12. @apache_setenv('no-gzip', 1);
  13. @ini_set('zlib.output_compression', 0);
  14. @ini_set('implicit_flush', 1);
  15. for ($i = 0; $i < ob_get_level(); $i++) { ob_end_flush(); }
  16. ob_implicit_flush(1);
  17. */
  18. /**
  19. * Should Wiki and HTML code be cleaned? If false, generate HTML for the
  20. * introduction as usual. May be useful for debugging.
  21. */
  22. const CLEAN = true;
  23. private static function removeShortIPA( $matches ) {
  24. return substr_count($matches[1], " ") <= 3 ? "" : $matches[0];
  25. }
  26. private static function replaceTooManyLinks( $matches ) {
  27. $match = $matches[0];
  28. $countLinks = substr_count($match ,"<a ");
  29. if ($countLinks <= 0) return $match;
  30. $match_without_links = preg_replace("~<a(\s+[^>]*)?>.*?</a>~", "", $match);
  31. $matchWords = split(" ", strip_tags($match_without_links));
  32. return $countLinks / sizeof($matchWords) >= 0.25 ? "" : $match;
  33. }
  34. /**
  35. * Get HTML from $wgOut->getHTML(), call self::cleanHtml(), set HTML
  36. * back into $wgOut.
  37. */
  38. static function cleanOutput() {
  39. global $wgOut;
  40. if (! self::CLEAN) return;
  41. // wfProfileIn( __METHOD__ );
  42. $html = $wgOut->getHTML();
  43. $html = self::cleanHtml($html);
  44. $wgOut->clearHTML();
  45. $wgOut->addHTML($html);
  46. // wfProfileOut( __METHOD__ );
  47. }
  48. /**
  49. * Use this line for debugging:
  50. * echo __LINE__, "<p/>\n", $html, "<hr/>\n";
  51. * @param $html the html
  52. */
  53. static function cleanHtml( $html ) {
  54. $html = preg_replace("~<br ?/?>~", "", $html);
  55. // remove map references
  56. // example: http://en.wikipedia.org/w/index.php?title=Attila_the_Hun&oldid=300682051
  57. $html = preg_replace("~\s?\(see map below\)~", "", $html);
  58. // <a href="/wikipedia/index.php/Image:Ltspkr.png" title="Image:Ltspkr.png">Image:Ltspkr.png</a>
  59. $html = preg_replace("~<a\s[^>]*Image:Ltspkr\.png[^>]*>[^<]*</a>~", "", $html);
  60. $html = preg_replace("~<a\s[^>]*File:Loudspeaker\.svg[^>]*>[^<]*</a>~", "", $html);
  61. // [[]]
  62. // TODO: Log warning. Links should have been rendered / removed.
  63. $html = preg_replace("/\[\[[^\]]*?\]\]/", "", $html);
  64. // remove nested round brackets: (x(y)z -> (xz
  65. do {
  66. $in = $html;
  67. $html = preg_replace("~(\([^)(]*)\([^)(]*\)~", "$1", $html);
  68. } while ($html != $in);
  69. // remove nested brackets: (x[y]z -> (xz#
  70. do {
  71. $in = $html;
  72. // TODO: use "~(\([^)[]*)\[[^)]]*\]~" instead
  73. $html = preg_replace("~(\([^)]*)\[[^]]*\]~", "$1", $html);
  74. } while ($html != $in);
  75. // delete parentheses with too many links
  76. $html = preg_replace_callback("/\s?\(.*?\)/", 'DBpediaFunctions::replaceTooManyLinks', $html);
  77. // <a href="http://www.britannica.com/EBchecked/topic/509710/rose" class="external autonumber" title="http://www.britannica.com/EBchecked/topic/509710/rose" rel="nofollow">[1]</a>
  78. $html = preg_replace("~<a[^>]*class=\"external autonumber[^>]*>.*?</a>~", "", $html);
  79. $html = self::replaceAudioSpans($html);
  80. // <sup id="cite_ref-0" class="reference"><a href="#cite_note-0" title="">[1]</a></sup>
  81. $html = preg_replace("~<sup(\s+[^>]*)?>.*?</sup>~", "", $html);
  82. $html = preg_replace("~<small(\s+[^>]*)?>.*?</small>~", "", $html);
  83. // (pronounced <span title="Pronunciation in the International Phonetic Alphabet (IPA)" class="IPA">
  84. // <span class="IPA
  85. $html = preg_replace("~\([^)]*<span [^>]*class=\"[^\"]*IPA[^\"]*\"[^>]*>.*?</span>~", "(", $html);
  86. // <a href="/wikipedia/index.php/Help:Pronunciation" class="mw-redirect" title="Help:Pronunciation">pronounced</a> <span class="IP
  87. $html = preg_replace("~\(<a href[^>]*>.*?</a> <span [^>]*class=\"[^\"]*IPA[^\"]*\"[^>]*>.*?</span>~", "(", $html);
  88. // <a href="/wikipedia/index.php?title=Special:Upload&amp;wpDestFile=En-us-Denmark.ogg" class="new" title="En-us-Denmark.ogg">[?d?nm?rk]</a></span>
  89. $html = preg_replace("~<a .*?>\[(.*?)\]</a>~", "", $html);
  90. $html = preg_replace("~<a .*?>\[?(.*?)\]?</a>~", "$1", $html);
  91. //<strong class="error">
  92. $html = preg_replace("~<strong class=\"error\">.*?</strong>~", "", $html);
  93. $html = strip_tags($html);
  94. $html = str_replace("&nbsp;", " ", $html);
  95. $html = str_replace("&#32;", " ", $html);
  96. $html = str_replace("&#x20;", " ", $html);
  97. // TODO: why is it necessary to remove '//'? And why escape '/'?
  98. $html = str_replace("\/\/", "", $html);
  99. $html = self::removeNonsense($html);
  100. // remove " ' '" before ")" TODO: why?
  101. $html = preg_replace("/[\s]*'[\s]*'\)/", ")", $html);
  102. // remove "' ' " after "(" TODO: why?
  103. $html = preg_replace("/\('[\s]*'[\s]*/", "(", $html);
  104. // remove "xyz:,;..." after "("
  105. // example: (Sinhalese:, ... )
  106. $html = preg_replace("/\([^\s\)]*?:[,;]+/", "(", $html);
  107. // remove "(xyz: ')" and "(xyz: )"
  108. // example: (Arabic: ')
  109. // not necessary - we remove brackets containing colons below
  110. // $html = preg_replace("/\([^\s\)]*: '?\)/", "", $html);
  111. $html = self::removeNonsense($html);
  112. // remove "(... ')"
  113. $html = preg_replace("/\([^\s\)]* '\)/", "", $html);
  114. // remove "(' ...)"
  115. $html = preg_replace("/\(' [^\s\)]*\)/", "", $html);
  116. $html = self::removeNonsense($html);
  117. $html = str_replace("\n", " ", $html);
  118. $html = trim($html);
  119. //AUDIO stuff:
  120. // , IPA:?t???si:,
  121. $html = preg_replace_callback("/,\s*IPA:(.*?),/s", 'DBpediaFunctions::removeShortIPA', $html);
  122. // remove brackets containing colons.
  123. $html = preg_replace("/\([^\)]*?:[^\)]*?\)/s", "", $html);
  124. $html = self::removeNonsense($html);
  125. // if there's a closing bracket before an opening bracket, remove it.
  126. if (strpos($html, ")") < strpos($html, "(")) {
  127. $html = preg_replace('/\)/', '', $html, 1);
  128. }
  129. // Todo: what about "3.4 % of"?
  130. $html = preg_replace("/\.([^0-9.][^0-9.])/", ". $1", $html);
  131. $html = preg_replace("/ +/", " ", $html);
  132. return $html;
  133. }
  134. private static function removeNonsense($html) {
  135. //Remove spaces before commas and semicolons
  136. $html = preg_replace("/\s+,/", ",", $html);
  137. $html = preg_replace("/\s+;/", ";", $html);
  138. // TODO: when can this happen?
  139. // $html = preg_replace("/[;|,]..[;|,]/", ",", $html);
  140. //Remove spaces near brackets
  141. $html = preg_replace("~\(\s*~", "(", $html);
  142. $html = preg_replace("~\s*\)~", ")", $html);
  143. //Remove commas etc. near brackets
  144. $html = preg_replace("~\(([;,:]\s*)*~", "(", $html);
  145. $html = preg_replace("~(\s*[;,:])*\)~", ")", $html);
  146. //Remove empty brackets
  147. $html = preg_replace("/\[[\s]*\]/", "", $html);
  148. $html = preg_replace("/\([\s]*\)/", "", $html);
  149. return $html;
  150. }
  151. /**
  152. * TODO: Log if the following matches. It's probably no longer needed.
  153. */
  154. private static function replaceAudioSpans( $html ) {
  155. // <p><span class="unicode audiolink"><a href="http://upload.wikimedia.org/wikipedia/commons/b/b4/Stavanger.ogg" class="internal" title="Stavanger.ogg"><b>Stavanger</b></a></span>
  156. if ((strpos($html, "<span class=\"unicode audiolink\">") === 0) || (strpos($html, "<b><span class=\"unicode audiolink\">") === 0)) {
  157. /*
  158. $html = preg_replace("~<span class=\"unicode audiolink\"><a .*?>(.*?)</a></span>~s", "$1", $html);
  159. */
  160. $html = preg_replace("~<span class=\"unicode audiolink\">(.*?)</span>~s", "$1", $html);
  161. } else {
  162. //<span class="unicode audiolink"><a href="/wikipedia/index.php?title=Special:Upload&amp;wpDestFile=Eugen_Berthold_Friedrich_Brecht.ogg" class="new" title="Eugen Berthold Friedrich Brecht.ogg"><b>Eugen Berthold Friedrich Brecht</b></a></span>&nbsp;<span class="metadata audiolinkinfo">>(<a href="/wikipedia/index.php/Wikipedia:Media_help" title="Wikipedia:Media help">help</a>�<a href="/wikipedia/index.php?title=Image:Eugen_Berthold_Friedrich_Brecht.ogg&amp;action=edit&amp;redlink=1" class="new" title="Image:Eugen Berthold Friedrich Brecht.ogg (page does not exist)">info</a>)</small></span>
  163. $html = preg_replace("~<span class=\"unicode audiolink\"><a [^>]*><b>(.*?)</b></a></span>~s", "$1", $html);
  164. // <span class="unicode audiolink"><a href="/wikipedia/index.php?title=Special:Upload&amp;wpDestFile=Anime.ogg" class="new" title="Anime.ogg"><i>listen</i></a></span>&nbsp;
  165. $html = preg_replace("~<span class=\"unicode audiolink\">.*?</span>~s", "", $html);
  166. // replace audio spans in parentheses
  167. // (<span class="unicode" style="white-space: nowrap;"> <a href="/wikipedia_en/index.php?title=Special:Upload&amp;wpDestFile=It-Leonardo_di_ser_Piero_da_Vinci.ogg" class="new" title="It-Leonardo di ser Piero da Vinci.ogg">pronunciation</a> </span>
  168. while (preg_match("~\(([^)]*?)<span\sclass=\"unicode\"[^>]*>.*?</span>~s", $html)) {
  169. $html = preg_replace("~\(([^)]*?)<span\sclass=\"unicode\"[^>]*>.*?</span>~s", "($1", $html);
  170. }
  171. }
  172. // TODO: this shouldn't be replaced when it is at the beginning of the text
  173. //<span class="metadata audiolinkinfo"><small>(<a href="/wikipedia/index.php/Wikipedia:Media_help" title="Wikipedia:Media help">help</a>�
  174. $html = preg_replace("~<span class=\"metadata audiolinkinfo\">.*?</span>~s", "", $html);
  175. $html = preg_replace("~<span class=\"audiolinkinfo\">.*?</span>~s", "", $html);
  176. $html = preg_replace("~<a href=\"\/wikipedia\/index\.php\/Datei:Loudspeaker\.svg\" title=\"Datei:Loudspeaker.svg\">Datei:Loudspeaker\.svg<\/a>~", "", $html);
  177. // echo $html."<hr>";
  178. return $html;
  179. }
  180. /**
  181. * Use this line for debugging:
  182. * echo __LINE__, "<p/>\n", $wiki, "<hr/>\n";
  183. * @param $wiki the wiki source
  184. * @return a clean and short version of the given wiki source
  185. */
  186. static function getAbstract( $wiki ) {
  187. //wfProfileIn( __METHOD__ );
  188. $section = self::getFirstSection($wiki);
  189. //echo str_replace("\n", "<br>", $section) . "<br>---</br>";
  190. if (self::CLEAN) $section = self::cleanWikiText($section);
  191. // The first occurrence of a bold word probably distinguishes the introductory section.
  192. $section = self::getBoldSection($section);
  193. //wfProfileOut( __METHOD__ );
  194. return $section;
  195. }
  196. // only check the first three sections
  197. const MAX_SECTIONS = 3;
  198. /**
  199. * The first occurrence of a bold word probably distinguishes the introductory section.
  200. */
  201. private static function getFirstSection( $wiki ) {
  202. // remove comments
  203. $wiki = preg_replace('~ ?<!--.*?-->~s','', $wiki);
  204. // echo $wiki."<hr>";
  205. // split text into sections: separated by headings or horizontal lines
  206. $sections = preg_split("~(^=.*=\s*$|^----.*$)~m", $wiki, self::MAX_SECTIONS + 1, PREG_SPLIT_NO_EMPTY);
  207. $first_sections = "";
  208. foreach ($sections as $si => $section) {
  209. if ($si === self::MAX_SECTIONS) break;
  210. if (strpos($section, "'''") !== false) {
  211. // echo "<h1>FOUND ''' IN</H1>" . $section . "<hr>";
  212. $parts = explode("'''", $section);
  213. $passed_parts = "";
  214. foreach ($parts as $part) {
  215. $passed_parts .= $part;
  216. $first_section_plus_part = $first_sections . $passed_parts;
  217. $count_open_brackets = substr_count($first_section_plus_part, "{{");
  218. $count_closing_brackets = substr_count($first_section_plus_part, "}}");
  219. if ($count_open_brackets <= $count_closing_brackets) {
  220. // echo "<h1>FOUND IN</H1>" . $first_sections . $section . "<hr>";
  221. return $first_sections . $section;
  222. }
  223. }
  224. }
  225. $first_sections .= $section;
  226. }
  227. // No bold word? Just use the first section. TODO: improve the heuristic.
  228. return strlen($sections[0]) >= 20 ? $sections[0] : "";
  229. }
  230. /**
  231. * The first occurrence of a bold word probably distinguishes the introductory section.
  232. */
  233. private static function getBoldSection( $wiki ) {
  234. // split text into sections: separated by headings or horizontal lines
  235. $sections = preg_split("~(^=.*=\s*$|^----.*$)~m", $wiki, self::MAX_SECTIONS + 1, PREG_SPLIT_NO_EMPTY);
  236. foreach ($sections as $si => $section) {
  237. if ($si === self::MAX_SECTIONS) break;
  238. // split section into paragraphs: separated by empty lines
  239. $paras = preg_split("~\n\s*\n~", $section, -1, PREG_SPLIT_NO_EMPTY);
  240. foreach ($paras as $pi => $para) {
  241. // remove all paragraphs before the one containing the bold word
  242. if (strpos($para, "'''") !== false) {
  243. //echo "BOLD SECTION!!! ".$section;
  244. return $pi === 0 ? $section : implode(' ', array_slice($paras, $pi));
  245. }
  246. }
  247. }
  248. // No bold word? Just use the first section. TODO: improve the heuristic.
  249. //echo "SECTION: ".$sections[0]."<hr>";
  250. return strlen($sections[0]) >= 20 ? $sections[0] : "";
  251. }
  252. private static function cleanWikiText( $wiki ) {
  253. /*
  254. $count_open_brackets = substr_count($wiki, "{{");
  255. $count_closing_brackets = substr_count($wiki, "}}");
  256. if ($count_open_brackets > $count_closing_brackets) {
  257. // If template brackets don't match - which is the case in roughly
  258. // 1000 articles of 3 mio - the recursive regex in cleanTemplates()
  259. // may crash our process with a stack overflow. To avoid this, simply
  260. // don't extract an abstract for this page.
  261. // return "";
  262. }
  263. */
  264. $wiki = str_replace("\r\n", "\n", $wiki);
  265. $wiki = str_replace("\r", "\n", $wiki);
  266. // remove comments
  267. // moved to getBoldSection $wiki = preg_replace('~ ?<!--.*?-->~s','', $wiki);
  268. // remove tables
  269. // was: $wiki = preg_replace('~{\|.*?\|}~s','', $wiki);
  270. $wiki = preg_replace('~^[\s|:]*\{\|~m', "12345654321", $wiki);
  271. $wiki = preg_replace('~^[\s|:]*\|\}(?!\})~m', "98765456789", $wiki);
  272. // echo $wiki . "<hr>";
  273. $wiki = self::cleanTemplates($wiki);
  274. // echo $wiki . "<hr>";
  275. $wiki = str_replace("12345654321", "{|", $wiki);
  276. $wiki = str_replace("98765456789", "|}", $wiki);
  277. // echo $wiki . "<hr>";
  278. $wiki_array = explode("\n", $wiki);
  279. //array_walk($wiki_array, create_function('&$temp', $temp = trim($temp)));
  280. $wiki = "";
  281. $c = 0;
  282. foreach ($wiki_array as $line) {
  283. $tLine = ltrim($line);
  284. if ((strlen($tLine) >= 2) && (strncmp($tLine, "{|", 2) == 0)) {
  285. $c += 1;
  286. } else if (($c > 0) && (strlen($tLine) >= 2) && (strncmp($tLine, "|}", 2) == 0)) {
  287. $c -= 1;
  288. if ($c == 0 && strlen($tLine) >= 3) $wiki .= substr($tline, 2) . "\n";
  289. } else if ($c == 0) $wiki .= $line . "\n";
  290. }
  291. /*
  292. do {
  293. $in = $wiki;
  294. if (strpos($wiki, "{|") !== false) {
  295. $wiki = preg_replace('~\{\|((?!\{\||\|\}).)*\|\}~s', '', $wiki);
  296. }
  297. } while ($wiki != $in);
  298. */
  299. // clean <math> </math>
  300. $wiki = preg_replace("~<math(\s+[^>/]*)?>.*?</math>~s", "", $wiki);
  301. // clean <imagemap>
  302. $wiki = preg_replace("~<imagemap(\s+[^>/]*)?>.*?</imagemap>~s", "", $wiki);
  303. // clean <gallery>
  304. $wiki = preg_replace("~<gallery(\s+[^>/]*)?>.*?</gallery>~s", "", $wiki);
  305. // first clean references, then clean links (there might be special links with refs as parameters)
  306. $wiki = self::cleanRefs($wiki);
  307. $wiki = self::cleanLinks($wiki);
  308. return $wiki;
  309. }
  310. /**
  311. * Remove some templates: 'otheruses', 'TOC', 'Unreferenced',
  312. * 'Audio',
  313. * and templates that contain line breaks or are in a separate line.
  314. * @param $wiki the wiki source
  315. * @return the given wiki source minus some templates
  316. */
  317. private static function cleanTemplates( $wiki ) {
  318. //Remove single curly braces
  319. $wiki = preg_replace('/(?<!\{)\{(?!\{)/s', "$1($2", $wiki);
  320. $wiki = preg_replace('/(?<!\})\}(?!\})/s', "$1)$2", $wiki);
  321. $wiki = preg_replace('/\{\{otheruses\}\}/xi', "", $wiki);
  322. $wiki = preg_replace('/\{\{TOC[a-zA-Z]*\}\}/xi', "", $wiki);
  323. $wiki = preg_replace('/\{\{Unreferenced[^\}]*\}\}/xi', "", $wiki);
  324. // $wiki = preg_replace('/\{\{Audio\s*\|([^\|\}]*)\|([^\|\}]*)\}\}/i', "$2", $wiki);
  325. // $wiki = preg_replace('/\(\{\{IPA[^\}]*\}\}\)/', "", $wiki);
  326. //HACK leave Bio Templates in italian Wikipedia
  327. $wiki = str_replace('{{Bio', 'BIO35363', $wiki);
  328. // TODO: what does (? > and (?R) mean?
  329. preg_match_all('/\{\{((?>[^\{\}]+)|(?R))*\}\}/x',$wiki,$templates);
  330. // preg_match_all('/\{\{(?:(?!\{\{|\}\}).)*\}\}/x',$wiki,$templates);
  331. $wiki_array = split("( |\t)*\n( |\t)*", $wiki);
  332. array_walk($wiki_array, create_function('&$temp', $temp = trim($temp)));
  333. foreach($templates[0] as $tpl) {
  334. // echo "TEMPLATE: " . $tpl . "<hr>";
  335. // echo $tpl . "<hr>";
  336. // TODO: why do we need this? The regex above
  337. // only matches strings that start with '{'.
  338. if($tpl[0]!='{') {
  339. continue;
  340. }
  341. // If the template contains line breaks or is
  342. // in a separate line, remove it.
  343. if (strpos($tpl, "\n") || in_array($tpl, $wiki_array)) {
  344. $wiki = str_replace($tpl, "", $wiki);
  345. // if template would be a seperate line after deleting other templates, we wouldn't find it,
  346. // so we split the wiki page again
  347. // example: http://en.wikipedia.org/w/index.php?title=Vladimir_Lenin&oldid=300435749
  348. // Todo: Time costs, new errors?
  349. $wiki_array = split("( |\t)*\n( |\t)*", $wiki);
  350. }
  351. }
  352. // delete lines with only templates on it, keep lines with at least one word character besides templates
  353. $wiki_array = split("( |\t)*\n( |\t)*", $wiki);
  354. array_walk($wiki_array, create_function('&$temp', $temp = trim($temp)));
  355. foreach ($wiki_array as $id => $line) {
  356. $line_temp = $line;
  357. foreach ($templates[0] as $tpl) {
  358. $line_temp = str_replace($tpl, "", $line_temp);
  359. }
  360. $line_temp = trim(strip_tags($line_temp));
  361. // if we didn't change anything, don't delete the line.
  362. // TODO: we want to remove lines that contain stuff like '*' at the start and otherwise only templates,
  363. // but we may want to keep lines that contain templates but important wiki markup, like ']]'.
  364. if (($line_temp !== $line) && (! preg_match("~\pL~", $line_temp))) {
  365. unset($wiki_array[$id]);
  366. }
  367. }
  368. $wiki = implode("\n", $wiki_array);
  369. //HACK leave Bio Templates in italian Wikipedia
  370. $wiki = str_replace('BIO35363', '{{Bio', $wiki);
  371. return $wiki;
  372. }
  373. /**
  374. * Remove special links (ones whose target starts with a namespace).
  375. * @param $wiki the wiki source
  376. * @return the given wiki source minus some links
  377. */
  378. private static function cleanLinks( $wiki ) {
  379. return preg_replace("/\[\[[^\|\[\]\{\}\.]+:(?:[^\]]*?\[\[[^\]]*?\]\])*[^\[]*?\]\] */", "", $wiki);
  380. }
  381. /**
  382. * @param $wiki the wiki source
  383. * @return the given wiki source minus some links
  384. */
  385. private static function cleanRefs( $wiki ) {
  386. $wiki = preg_replace("~<ref(\s+[^>/]*)?/>~", "", $wiki);
  387. $wiki = preg_replace("~<ref(\s+[^>/]*)?>.*?</ref>~", "", $wiki);
  388. return $wiki;
  389. }
  390. static function print_html_stack() {
  391. $pathlen = strlen(dirname(__FILE__)) + 1;
  392. $stack = debug_backtrace(false);
  393. foreach ($stack as $index => $frame) {
  394. if ($index > 0) echo $frame['class'], '::', $frame['function'], ' (', $file, ':', $line, ')', "<br/>\n";
  395. $file = $frame['file'];
  396. $file = substr($file, $pathlen);
  397. $line = $frame['line'];
  398. }
  399. echo "<hr/>\n";
  400. }
  401. }