PageRenderTime 23ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/core/src/main/scala/org/dbpedia/extraction/mappings/AbstractExtractor.scala

https://gitlab.com/varunkothamachu/extraction-framework
Scala | 366 lines | 183 code | 58 blank | 125 comment | 21 complexity | f0052db6834dfcfc7df3ff82debc5b18 MD5 | raw file
  1. package org.dbpedia.extraction.mappings
  2. import java.io.{InputStream, OutputStreamWriter}
  3. import java.net.URL
  4. import java.util.logging.{Level, Logger}
  5. import org.dbpedia.extraction.destinations.{DBpediaDatasets, Quad, QuadBuilder}
  6. import org.dbpedia.extraction.ontology.Ontology
  7. import org.dbpedia.extraction.util.Language
  8. import org.dbpedia.extraction.wikiparser._
  9. import org.dbpedia.util.text.ParseExceptionIgnorer
  10. import org.dbpedia.util.text.html.{HtmlCoder, XmlCodes}
  11. import scala.io.Source
  12. import scala.language.reflectiveCalls
  13. import scala.xml.XML
  14. /**
  15. * Extracts page abstracts.
  16. *
  17. * From now on we use MobileFrontend for MW <2.21 and TextExtracts for MW > 2.22
  18. * The patched mw instance is no longer needed except from minor customizations in LocalSettings.php
  19. * TODO: we need to adapt the TextExtracts extension to accept custom wikicode syntax.
  20. * TextExtracts now uses the article entry and extracts the abstract. The rationale for
  21. * the new extension is that we will not need to load all articles in MySQL, just the templates
  22. * At the moment, setting up the patched MW takes longer than the loading of all articles in MySQL :)
  23. * so, even this way it's way better and cleaner ;)
  24. * We leave the old code commented since we might re-use it soon
  25. */
  26. class AbstractExtractor(
  27. context : {
  28. def ontology : Ontology
  29. def language : Language
  30. }
  31. )
  32. extends PageNodeExtractor
  33. {
  34. //TODO make this configurable
  35. protected def apiUrl: String = "http://localhost/mediawiki/api.php"
  36. private val maxRetries = 3
  37. /** timeout for connection to web server, milliseconds */
  38. private val connectMs = 2000
  39. /** timeout for result from web server, milliseconds */
  40. private val readMs = 8000
  41. /** sleep between retries, milliseconds, multiplied by CPU load */
  42. private val sleepFactorMs = 4000
  43. private val language = context.language.wikiCode
  44. private val logger = Logger.getLogger(classOf[AbstractExtractor].getName)
  45. //private val apiParametersFormat = "uselang="+language+"&format=xml&action=parse&prop=text&title=%s&text=%s"
  46. private val apiParametersFormat = "uselang="+language+"&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s"
  47. // lazy so testing does not need ontology
  48. private lazy val shortProperty = context.ontology.properties("rdfs:comment")
  49. // lazy so testing does not need ontology
  50. private lazy val longProperty = context.ontology.properties("abstract")
  51. private lazy val longQuad = QuadBuilder(context.language, DBpediaDatasets.LongAbstracts, longProperty, null) _
  52. private lazy val shortQuad = QuadBuilder(context.language, DBpediaDatasets.ShortAbstracts, shortProperty, null) _
  53. override val datasets = Set(DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts)
  54. private val osBean = java.lang.management.ManagementFactory.getOperatingSystemMXBean()
  55. private val availableProcessors = osBean.getAvailableProcessors()
  /**
   * Produces the abstract statements for a single page.
   *
   * Only pages in the Main namespace that are neither redirects nor disambiguation
   * pages are processed. The abstract text is fetched with retrievePage, adjusted by
   * postProcess, and shortened with short.
   *
   * @param pageNode the parsed page
   * @param subjectUri subject of the generated quads
   * @param pageContext extraction context (not used by this extractor)
   * @return an empty sequence when the page is skipped or its abstract is blank;
   *         otherwise the long-abstract quad, followed by the short-abstract quad
   *         unless the shortened text is empty
   */
  override def extract(pageNode : PageNode, subjectUri : String, pageContext : PageContext): Seq[Quad] =
  {
    // Main namespace only, and no redirect / disambiguation pages
    val eligible =
      pageNode.title.namespace == Namespace.Main &&
      !(pageNode.isRedirect || pageNode.isDisambiguation)

    if (!eligible)
    {
      Seq.empty
    }
    else
    {
      // The wiki-text based path (getAbstractWikiText) is currently disabled;
      // the abstract is requested by title only.
      val abstractText = postProcess(pageNode.title, retrievePage(pageNode.title /*, abstractWikiText*/))

      if (abstractText.trim.isEmpty)
      {
        Seq.empty
      }
      else
      {
        // Shortened version of the abstract
        val shortAbstract = short(abstractText)

        // Both statements are built before deciding which ones to emit
        val longStatement = longQuad(subjectUri, abstractText, pageNode.sourceUri)
        val shortStatement = shortQuad(subjectUri, shortAbstract, pageNode.sourceUri)

        if (shortAbstract.isEmpty) Seq(longStatement) else Seq(longStatement, shortStatement)
      }
    }
  }
  /**
   * Retrieves the abstract of a Wikipedia page from the MediaWiki API at apiUrl.
   *
   * The request parameters are sent as a UTF-8 encoded POST body. The request is
   * attempted up to maxRetries times; between failed attempts the thread sleeps for
   * sleepFactorMs multiplied by the system load per processor (or for sleepFactorMs
   * when the load average is not available).
   *
   * @param pageTitle title of the page; its encodedWithNamespace form is sent as the "titles" parameter
   * @return the abstract text produced by readInAbstract from the API response
   * @throws Exception if no attempt succeeded; the last failure is attached as the cause
   */
  def retrievePage(pageTitle : WikiTitle/*, pageWikiText : String*/) : String =
  {
    // The encoded title may contain some URI-escaped characters (e.g. "5%25-Klausel"),
    // so we can't use URLEncoder.encode(). But "&" is not escaped, so we do this here.
    // TODO: there may be other characters that need to be escaped.
    val titleParam = AbstractExtractor.CHARACTERS_TO_ESCAPE.foldLeft(pageTitle.encodedWithNamespace) {
      case (title, (search, replacement)) => title.replace(search, replacement)
    }

    // Fill parameters
    val parameters = apiParametersFormat.format(titleParam/*, URLEncoder.encode(pageWikiText, "UTF-8")*/)

    val url = new URL(apiUrl)

    // Remembered so that the final exception can carry the underlying cause
    var lastError : Option[Exception] = None

    // A plain while loop keeps `return` local (no closure, no non-local return)
    var counter = 1
    while (counter <= maxRetries)
    {
      try
      {
        // Send data
        val conn = url.openConnection
        conn.setDoOutput(true)
        conn.setConnectTimeout(connectMs)
        conn.setReadTimeout(readMs)

        // Explicit charset: without it the platform default encoding would be used
        val writer = new OutputStreamWriter(conn.getOutputStream, "UTF-8")
        try
        {
          writer.write(parameters)
          writer.flush()
        }
        finally
        {
          writer.close()
        }

        // Read answer
        val response = conn.getInputStream
        try
        {
          return readInAbstract(response)
        }
        finally
        {
          // A failure to close must not discard an abstract that was already read
          try response.close() catch { case _ : java.io.IOException => }
        }
      }
      catch
      {
        case ex: Exception => {
          lastError = Some(ex)

          // The web server may still be trying to render the page. If we send new requests
          // at once, there will be more and more tasks running in the web server and the
          // system eventually becomes overloaded. So we wait a moment. The higher the load,
          // the longer we wait.

          // if the load average is not available, a negative value is returned
          val load = osBean.getSystemLoadAverage()
          val (loadFactor, sleepMs) =
            if (load >= 0)
            {
              val factor = load / availableProcessors
              (factor, (factor * sleepFactorMs).toInt)
            }
            else
            {
              (Double.NaN, sleepFactorMs)
            }

          if (counter < maxRetries) {
            logger.log(Level.INFO, "Error retrieving abstract of " + pageTitle + ". Retrying after " + sleepMs + " ms. Load factor: " + loadFactor, ex)
            Thread.sleep(sleepMs)
          }
          else {
            ex match {
              case _ : java.net.SocketTimeoutException => logger.log(Level.INFO,
                "Timeout error retrieving abstract of " + pageTitle + " in " + counter + " tries. Giving up. Load factor: " +
                loadFactor, ex)
              case _ => logger.log(Level.INFO,
                "Error retrieving abstract of " + pageTitle + " in " + counter + " tries. Giving up. Load factor: " +
                loadFactor, ex)
            }
          }
        }
      }

      counter += 1
    }

    throw new Exception("Could not retrieve abstract for page: " + pageTitle, lastError.orNull)
  }
  /**
   * Shortens a text to its leading sentences.
   *
   * A text shorter than `max` characters is returned unchanged. Otherwise the text is
   * split after every dot that is followed by a whitespace character, and as many leading
   * sentences as fit into `max` characters are concatenated and trimmed. If not even the
   * first sentence fits, that first sentence is returned as is (it may exceed `max`).
   *
   * TODO: probably doesn't work for most non-European languages.
   * TODO: analyse ActiveAbstractExtractor, I think this works quite well there,
   * because it takes the first two or three sentences
   *
   * @param text the text to shorten
   * @param max maximum length of the concatenated sentences
   * @return the shortened text
   */
  def short(text : String, max : Int = 500) : String =
  {
    if (text.length < max)
    {
      text
    }
    else
    {
      val sentences = text.split("""(?<=\.\s)""")

      // cumulative(i) is the total length of sentences 0..i
      val cumulative = sentences.scanLeft(0)(_ + _.length).tail

      // number of leading sentences whose combined length stays within the limit
      val fitting = cumulative.takeWhile(_ <= max).length

      val kept = sentences.take(fitting).mkString

      if (fitting == sentences.length)
      {
        kept.trim
      }
      else if (kept.isEmpty)
      {
        // nothing fits: fall back to the sentence that broke the limit
        sentences(fitting)
      }
      else
      {
        kept.trim
      }
    }
  }
  /**
   * Get the parsed and cleaned abstract text from the MediaWiki instance input stream.
   * The response is expected to look like
   * <api> <query> <pages> <page> <extract> ABSTRACT_TEXT </extract> </page> </pages> </query> </api>
   * /// <api> <parse> <text> ABSTRACT_TEXT </text> </parse> </api>
   *
   * The stream is read as UTF-8 and is closed before this method returns, also on failure.
   *
   * @param inputStream response body of the API request
   * @return the trimmed text of the extract element, HTML-decoded and with every match of
   *         AbstractExtractor.patternsToRemove replaced
   */
  private def readInAbstract(inputStream : InputStream) : String =
  {
    // for XML format
    val source = Source.fromInputStream(inputStream, "UTF-8")
    val xmlAnswer =
      try
      {
        // NOTE(review): lines are joined without a separator, so line breaks inside the
        // response are dropped - confirm this is intended for plain-text extracts.
        source.getLines().mkString("")
      }
      finally
      {
        // closing the source also closes the underlying input stream
        source.close()
      }

    //val text = (XML.loadString(xmlAnswer) \ "parse" \ "text").text.trim
    val extract = (XML.loadString(xmlAnswer) \ "query" \ "pages" \ "page" \ "extract").text.trim

    // Apply every clean-up pattern in order to the decoded text
    AbstractExtractor.patternsToRemove.foldLeft(decodeHtml(extract)) {
      case (text, (regex, replacement)) => regex.pattern.matcher(text).replaceAll(replacement)
    }
  }
  /**
   * Prepends the page title to an abstract that starts with a lower-case character
   * and does not mention the title.
   *
   * A trailing parenthesised suffix (" (...)") is stripped from the decoded title
   * before it is compared and prepended. The comparison ignores case.
   *
   * @param pageTitle title of the page the abstract belongs to
   * @param text the abstract text
   * @return the text, prefixed with the title and a space when the rule above applies;
   *         otherwise the text unchanged
   */
  private def postProcess(pageTitle: WikiTitle, text: String): String =
  {
    // "lower case" means: upper-casing the first character (in the wiki's locale) changes it
    val startsWithLowercase = text.nonEmpty && {
      val initial = text.substring(0, 1)
      initial != initial.toUpperCase(context.language.locale)
    }

    //HACK
    if (!startsWithLowercase)
    {
      text
    }
    else
    {
      val bareTitle = pageTitle.decoded.replaceFirst(" \\(.+\\)$", "")
      if (text.toLowerCase.contains(bareTitle.toLowerCase))
      {
        text
      }
      else
      {
        // happens mainly for Japanese names (abstract starts with template)
        bareTitle + " " + text
      }
    }
  }
  224. //private val destinationNamespacesToRender = List(Namespace.Main, Namespace.Template)
  225. /*
  226. private def renderNode(node : Node) = node match
  227. {
  228. case InternalLinkNode(destination, _, _, _) => destinationNamespacesToRender contains destination.namespace
  229. case ParserFunctionNode(_, _, _) => false
  230. case _ => true
  231. }
  232. */
  233. /**
  234. * Get the wiki text that contains the abstract text.
  235. */
  236. /*
  237. def getAbstractWikiText(pageNode : PageNode) : String =
  238. {
  239. // From first TextNode
  240. val start = pageNode.children.indexWhere{
  241. case TextNode(text, _) => text.trim != ""
  242. case InternalLinkNode(destination, _, _, _) => destination.namespace == Namespace.Main
  243. case _ => false
  244. }
  245. // To first SectionNode (exclusive)
  246. var end = pageNode.children.indexWhere{
  247. case sectionNode : SectionNode => true
  248. case _ => false
  249. }
  250. // If there is no SectionNode, To last non-empty TextNode (inclusive)
  251. if(end == -1)
  252. {
  253. val reverseLastTextIndex = pageNode.children.reverse.indexWhere{
  254. case TextNode(text, _) => text.trim != ""
  255. case _ => false
  256. }
  257. if(reverseLastTextIndex != -1)
  258. {
  259. end = pageNode.children.length - reverseLastTextIndex
  260. }
  261. }
  262. // No result if there is no TextNode or no text before a SectionNode
  263. if(start == -1 || end == -1 || start >= end)
  264. {
  265. return ""
  266. }
  267. // Re-generate wiki text for found range of nodes
  268. val text = pageNode.children.slice(start, end)
  269. .filter(renderNode)
  270. .map(_.toWikiText)
  271. .mkString("").trim
  272. // decode HTML entities - the result is plain text
  273. decodeHtml(text)
  274. }
  275. */
  /**
   * Runs the given text through an HtmlCoder configured with XmlCodes.NONE.
   * Parse errors are handed to ParseExceptionIgnorer.INSTANCE.
   *
   * @param text the text to process
   * @return the result of HtmlCoder.code for the text
   */
  def decodeHtml(text: String): String = {
    val htmlCoder = new HtmlCoder(XmlCodes.NONE)
    htmlCoder.setErrorHandler(ParseExceptionIgnorer.INSTANCE)
    val decoded = htmlCoder.code(text)
    decoded
  }
  281. }
  object AbstractExtractor {

    /**
     * List of all characters which are reserved in a query component according to RFC 2396
     * with their escape sequences as determined by the JavaScript function encodeURIComponent.
     * Each entry is a (character, escape sequence) pair.
     */
    val CHARACTERS_TO_ESCAPE : List[(String, String)] = List(
      ";" -> "%3B",
      "/" -> "%2F",
      "?" -> "%3F",
      ":" -> "%3A",
      "@" -> "%40",
      "&" -> "%26",
      "=" -> "%3D",
      "+" -> "%2B",
      "," -> "%2C",
      "$" -> "%24"
    )

    /**
     * (pattern, replacement) pairs; every match of a pattern in the abstract text
     * is replaced by the paired replacement string.
     */
    val patternsToRemove : List[(scala.util.matching.Regex, String)] = List(
      ("""<div style=[^/]*/>""".r, " "),
      ("""</div>""".r, " ")
    )
  }
  299. val patternsToRemove = List(
  300. """<div style=[^/]*/>""".r -> " ",
  301. """</div>""".r -> " "
  302. )
  303. }