PageRenderTime 48ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/core/src/main/scala/org/dbpedia/extraction/mappings/MissingAbstractsExtractor.scala

https://gitlab.com/varunkothamachu/extraction-framework
Scala | 402 lines | 214 code | 62 blank | 126 comment | 26 complexity | ec6e13ca9a331bc7f52a31bd7f2593b3 MD5 | raw file
  1. package org.dbpedia.extraction.mappings
  2. import scala.collection.mutable
  3. import scala.xml.XML
  4. import scala.io.Source
  5. import scala.language.reflectiveCalls
  6. import java.io._
  7. import java.net.{URLEncoder, URL}
  8. import java.util.logging.{Logger, Level}
  9. import org.dbpedia.extraction.destinations.{DBpediaDatasets,Quad,QuadBuilder}
  10. import org.dbpedia.extraction.wikiparser._
  11. import org.dbpedia.extraction.ontology.Ontology
  12. import org.dbpedia.extraction.util.Language
  13. import org.dbpedia.util.text.html.{HtmlCoder, XmlCodes}
  14. import org.dbpedia.util.text.ParseExceptionIgnorer
  15. /**
  16. * Extracts page abstracts which are not yet extracted. For each page which is a candidate for extraction
  17. *
  18. * From now on we use MobileFrontend for MW <2.21 and TextExtracts for MW > 2.22
  19. * The patched mw instance is no longer needed except from minor customizations in LocalSettings.php
  20. * TODO: we need to adapt the TextExtracts extension to accept custom wikicode syntax.
  21. * TextExtracts now uses the article entry and extracts the abstract. The retional for
  22. * the new extension is that we will not need to load all articles in MySQL, just the templates
  23. * At the moment, setting up the patched MW takes longer than the loading of all articles in MySQL :)
  24. * so, even this way it's way better and cleaner ;)
  25. * We leave the old code commented since we might re-use it soon
  26. */
  27. class MissingAbstractsExtractor(
  28. context : {
  29. def ontology : Ontology
  30. def language : Language
  31. }
  32. )
  33. extends PageNodeExtractor
  34. {
  35. //TODO make this configurable
  36. protected def apiUrl: String = "http://localhost/mediawiki/api.php"
  37. private val maxRetries = 3
  38. /** timeout for connection to web server, milliseconds */
  39. private val connectMs = 2000
  40. /** timeout for result from web server, milliseconds */
  41. private val readMs = 8000
  42. /** sleep between retries, milliseconds, multiplied by CPU load */
  43. private val sleepFactorMs = 4000
  44. private val language = context.language.wikiCode
  45. private val logger = Logger.getLogger(classOf[AbstractExtractor].getName)
  46. //private val apiParametersFormat = "uselang="+language+"&format=xml&action=parse&prop=text&title=%s&text=%s"
  47. private val apiParametersFormat = "uselang="+language+"&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s"
  48. // lazy so testing does not need ontology
  49. private lazy val shortProperty = context.ontology.properties("rdfs:comment")
  50. // lazy so testing does not need ontology
  51. private lazy val longProperty = context.ontology.properties("abstract")
  52. private lazy val longQuad = QuadBuilder(context.language, DBpediaDatasets.MissingLongAbstracts, longProperty, null) _
  53. private lazy val shortQuad = QuadBuilder(context.language, DBpediaDatasets.MissingShortAbstracts, shortProperty, null) _
  54. override val datasets = Set(DBpediaDatasets.MissingLongAbstracts, DBpediaDatasets.MissingShortAbstracts)
  55. private val osBean = java.lang.management.ManagementFactory.getOperatingSystemMXBean()
  56. private val availableProcessors = osBean.getAvailableProcessors()
  /**
   * Produces the long (and, when non-empty, short) abstract quads for a page whose abstract is missing.
   *
   * Nothing is produced when the subject URI is already in
   * `MissingAbstractsExtractor.existingAbstracts`, when the page is outside the Main namespace,
   * when it is a redirect or disambiguation page, or when the retrieved abstract is blank.
   *
   * @param pageNode the parsed page
   * @param subjectUri subject URI of the page, used for the lookup and as quad subject
   * @param pageContext not used by this extractor
   * @return the extracted quads, possibly empty
   */
  override def extract(pageNode : PageNode, subjectUri : String, pageContext : PageContext): Seq[Quad] =
  {
    // Checked in this order: already extracted, wrong namespace, redirect, disambiguation.
    val skipPage =
      MissingAbstractsExtractor.existingAbstracts(subjectUri) ||
      pageNode.title.namespace != Namespace.Main ||
      pageNode.isRedirect ||
      pageNode.isDisambiguation

    if (skipPage)
    {
      Seq.empty
    }
    else
    {
      println(s"Detected missing abstract for '$subjectUri'")

      //Reproduce wiki text for abstract
      //val abstractWikiText = getAbstractWikiText(pageNode)
      // if(abstractWikiText == "") return Seq.empty

      //Retrieve page text and clean it up
      val rawAbstract = retrievePage(pageNode.title /*, abstractWikiText*/)
      val abstractText = postProcess(pageNode.title, rawAbstract)

      if (abstractText.trim.isEmpty)
      {
        logger.info(s"Empty abstract for subject $subjectUri")
        Seq.empty
      }
      else
      {
        //Create a short version of the abstract
        val shortText = short(abstractText)

        //Create statements
        val quadLong = longQuad(subjectUri, abstractText, pageNode.sourceUri)
        val quadShort = shortQuad(subjectUri, shortText, pageNode.sourceUri)

        if (shortText.isEmpty) Seq(quadLong) else Seq(quadLong, quadShort)
      }
    }
  }
  /**
   * Retrieves the abstract of a Wikipedia page from the MediaWiki API at `apiUrl`.
   *
   * The request parameters are written to the connection's output stream. If an attempt fails with
   * an exception, the request is repeated up to `maxRetries` times in total, sleeping between
   * attempts for a time that is scaled by the system load average when that is available.
   *
   * @param pageTitle title of the page; its `encodedWithNamespace` form is sent as the `titles` parameter
   * @return the abstract text parsed from the API response
   * @throws Exception if none of the attempts succeeded
   */
  def retrievePage(pageTitle : WikiTitle/*, pageWikiText : String*/) : String =
  {
    // The encoded title may contain some URI-escaped characters (e.g. "5%25-Klausel"),
    // so we can't use URLEncoder.encode(). But "&" is not escaped, so we do this here.
    // TODO: there may be other characters that need to be escaped.
    // NOTE(review): the companion object of this class declares its own CHARACTERS_TO_ESCAPE,
    // but the list of AbstractExtractor is the one used here.
    val titleParam = AbstractExtractor.CHARACTERS_TO_ESCAPE.foldLeft(pageTitle.encodedWithNamespace) {
      case (title, (search, replacement)) => title.replace(search, replacement)
    }

    // Fill parameters
    val parameters = apiParametersFormat.format(titleParam/*, URLEncoder.encode(pageWikiText, "UTF-8")*/)

    val url = new URL(apiUrl)

    var result: Option[String] = None
    var counter = 1
    while (result.isEmpty && counter <= maxRetries)
    {
      try
      {
        // Send data
        val conn = url.openConnection
        conn.setDoOutput(true)
        conn.setConnectTimeout(connectMs)
        conn.setReadTimeout(readMs)

        // Explicit charset instead of the platform default; closed even if writing fails.
        val writer = new OutputStreamWriter(conn.getOutputStream, "UTF-8")
        try
        {
          writer.write(parameters)
          writer.flush()
        }
        finally
        {
          writer.close()
        }

        // Read answer; the response stream is released whether or not parsing succeeds.
        val response = conn.getInputStream
        val abstractText =
          try readInAbstract(response)
          finally response.close()
        result = Some(abstractText)
      }
      catch
      {
        case ex: Exception =>
          // The web server may still be trying to render the page. If we send new requests
          // at once, there will be more and more tasks running in the web server and the
          // system eventually becomes overloaded. So we wait a moment. The higher the load,
          // the longer we wait.

          // if the load average is not available, a negative value is returned
          val load = osBean.getSystemLoadAverage()
          val (loadFactor, sleepMs) =
            if (load >= 0) {
              val factor = load / availableProcessors
              (factor, (factor * sleepFactorMs).toInt)
            }
            else {
              (Double.NaN, sleepFactorMs)
            }

          if (counter < maxRetries) {
            logger.log(Level.INFO, "Error retrieving abstract of " + pageTitle + ". Retrying after " + sleepMs + " ms. Load factor: " + loadFactor, ex)
            Thread.sleep(sleepMs)
          }
          else {
            // last attempt: only the wording distinguishes a timeout from any other failure
            val what = ex match {
              case _ : java.net.SocketTimeoutException => "Timeout error"
              case _ => "Error"
            }
            logger.log(Level.INFO,
              what + " retrieving abstract of " + pageTitle + " in " + counter + " tries. Giving up. Load factor: " +
              loadFactor, ex)
          }
      }
      counter += 1
    }

    result.getOrElse(throw new Exception("Could not retrieve abstract for page: " + pageTitle))
  }
  /**
   * Returns the leading sentences of the given text whose combined length does not exceed `max`.
   * A sentence ends with a dot followed by whitespace.
   *
   * A text shorter than `max` is returned unchanged. If already the first sentence is longer than
   * `max`, that whole sentence is returned (trimmed), so the result may exceed `max` in this case.
   *
   * TODO: probably doesn't work for most non-European languages.
   * TODO: analyse ActiveAbstractExtractor, I think this works quite well there,
   * because it takes the first two or three sentences
   *
   * @param text the full abstract
   * @param max max length
   * @return result string
   */
  def short(text : String, max : Int = 500) : String =
  {
    if (text.size < max)
    {
      text
    }
    else
    {
      // the look-behind keeps the dot and the whitespace at the end of each sentence
      val sentences = text.split("""(?<=\.\s)""")
      val builder = new StringBuilder()

      // take sentences as long as the next one still fits
      var index = 0
      while (index < sentences.length && builder.length + sentences(index).length <= max)
      {
        builder.append(sentences(index))
        index += 1
      }

      if (builder.isEmpty && index < sentences.length)
      {
        // Not even one sentence fits: return it whole, trimmed like every other result
        // (it used to keep the whitespace that followed its final dot).
        sentences(index).trim
      }
      else
      {
        builder.toString().trim
      }
    }
  }
  /**
   * Reads the API answer from the given stream and returns the cleaned abstract text.
   * The answer is expected to look like
   * <api> <query> <pages> <page> <extract> ABSTRACT_TEXT </extract> </page> </pages> </query> </api>
   * /// formerly: <api> <parse> <text> ABSTRACT_TEXT </text> </parse> </api>
   *
   * @param inputStream response stream of the MediaWiki API, read as UTF-8; it is not closed here
   * @return the trimmed text of the extract element, passed through decodeHtml
   */
  private def readInAbstract(inputStream : InputStream) : String =
  {
    // for XML format
    // NOTE(review): the lines are joined without a separator, so line breaks inside the
    // answer are dropped - confirm this is intended for plain-text extracts.
    val responseLines = Source.fromInputStream(inputStream, "UTF-8").getLines()
    val responseXml = XML.loadString(responseLines.mkString(""))

    //val text = (XML.loadString(xmlAnswer) \ "parse" \ "text").text.trim
    val extractNodes = responseXml \ "query" \ "pages" \ "page" \ "extract"

    decodeHtml(extractNodes.text.trim)
  }
  /**
   * Prepends the page title to an abstract that starts with a lowercase letter and does not
   * contain the title. Any other text is returned unchanged.
   *
   * All case conversions use the locale of the extraction language, so the outcome does not
   * depend on the default locale of the JVM.
   *
   * @param pageTitle title of the page the abstract belongs to
   * @param text the abstract text
   * @return the abstract, possibly prefixed with the title
   */
  private def postProcess(pageTitle: WikiTitle, text: String): String =
  {
    val locale = context.language.locale

    // true if upper-casing the first character changes it
    val startsWithLowercase = text.nonEmpty && {
      val firstLetter = text.substring(0, 1)
      firstLetter != firstLetter.toUpperCase(locale)
    }

    //HACK
    if (startsWithLowercase)
    {
      // drop a trailing parenthesised suffix, e.g. "Title (something)" -> "Title"
      val decodedTitle = pageTitle.decoded.replaceFirst(" \\(.+\\)$", "")

      if (! text.toLowerCase(locale).contains(decodedTitle.toLowerCase(locale)))
      {
        // happens mainly for Japanese names (abstract starts with template)
        decodedTitle + " " + text
      }
      else
      {
        text
      }
    }
    else
    {
      text
    }
  }
  229. //private val destinationNamespacesToRender = List(Namespace.Main, Namespace.Template)
  230. /*
  231. private def renderNode(node : Node) = node match
  232. {
  233. case InternalLinkNode(destination, _, _, _) => destinationNamespacesToRender contains destination.namespace
  234. case ParserFunctionNode(_, _, _) => false
  235. case _ => true
  236. }
  237. */
  238. /**
  239. * Get the wiki text that contains the abstract text.
  240. */
  241. /*
  242. def getAbstractWikiText(pageNode : PageNode) : String =
  243. {
  244. // From first TextNode
  245. val start = pageNode.children.indexWhere{
  246. case TextNode(text, _) => text.trim != ""
  247. case InternalLinkNode(destination, _, _, _) => destination.namespace == Namespace.Main
  248. case _ => false
  249. }
  250. // To first SectionNode (exclusive)
  251. var end = pageNode.children.indexWhere{
  252. case sectionNode : SectionNode => true
  253. case _ => false
  254. }
  255. // If there is no SectionNode, To last non-empty TextNode (inclusive)
  256. if(end == -1)
  257. {
  258. val reverseLastTextIndex = pageNode.children.reverse.indexWhere{
  259. case TextNode(text, _) => text.trim != ""
  260. case _ => false
  261. }
  262. if(reverseLastTextIndex != -1)
  263. {
  264. end = pageNode.children.length - reverseLastTextIndex
  265. }
  266. }
  267. // No result if there is no TextNode or no text before a SectionNode
  268. if(start == -1 || end == -1 || start >= end)
  269. {
  270. return ""
  271. }
  272. // Re-generate wiki text for found range of nodes
  273. val text = pageNode.children.slice(start, end)
  274. .filter(renderNode)
  275. .map(_.toWikiText)
  276. .mkString("").trim
  277. // decode HTML entities - the result is plain text
  278. decodeHtml(text)
  279. }
  280. */
  /**
   * Runs the given text through an HtmlCoder created with XmlCodes.NONE. Parse errors are
   * handed to ParseExceptionIgnorer.INSTANCE.
   *
   * @param text text that may contain HTML entities (presumably - see HtmlCoder for the exact rules)
   * @return the result of HtmlCoder.code
   */
  def decodeHtml(text: String): String = {
    val htmlCoder = new HtmlCoder(XmlCodes.NONE)
    htmlCoder.setErrorHandler(ParseExceptionIgnorer.INSTANCE)
    val decoded = htmlCoder.code(text)
    decoded
  }
  286. }
  287. object MissingAbstractsExtractor {
  288. private val logger = Logger.getLogger(classOf[MissingAbstractsExtractor].getName)
  289. /**
  290. * List of all characters which are reserved in a query component according to RFC 2396
  291. * with their escape sequences as determined by the JavaScript function encodeURIComponent.
  292. */
  293. val CHARACTERS_TO_ESCAPE = List(
  294. (";", "%3B"),
  295. ("/", "%2F"),
  296. ("?", "%3F"),
  297. (":", "%3A"),
  298. ("@", "%40"),
  299. ("&", "%26"),
  300. ("=", "%3D"),
  301. ("+", "%2B"),
  302. (",", "%2C"),
  303. ("$", "%24")
  304. )
  305. lazy val existingAbstracts = {
  306. val file = new File("existing-abstracts.tsv")
  307. logger.info(s"Starting to read list of existing abstracts from file '${file.getAbsolutePath}'")
  308. val reader = try {
  309. new BufferedReader(new FileReader("existing-abstracts.tsv"))
  310. }
  311. catch {
  312. case e: FileNotFoundException => logger.severe(s"Unable to find file '${file.getAbsolutePath}'." +
  313. s"Please generate it and put it in the given location.")
  314. throw e
  315. case e : Throwable => throw e
  316. }
  317. val set: mutable.HashSet[String] = mutable.HashSet()
  318. var line: String = null
  319. var first = true
  320. while ( {
  321. line = reader.readLine(); line != null
  322. }) {
  323. if (first) {
  324. first = false
  325. }
  326. else {
  327. val parts = line.split("\t")
  328. set.add(parts(1))
  329. }
  330. }
  331. reader.close()
  332. logger.info(s"Done reading existing abstract names: ${set.size} abstracts already existing")
  333. set
  334. }
  335. }