/core/src/main/scala/org/dbpedia/extraction/mappings/AbstractExtractor.scala
https://gitlab.com/varunkothamachu/extraction-framework
package org.dbpedia.extraction.mappings

import java.io.{InputStream, OutputStreamWriter}
import java.net.URL
import java.util.logging.{Level, Logger}

import org.dbpedia.extraction.destinations.{DBpediaDatasets, Quad, QuadBuilder}
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.util.Language
import org.dbpedia.extraction.wikiparser._
import org.dbpedia.util.text.ParseExceptionIgnorer
import org.dbpedia.util.text.html.{HtmlCoder, XmlCodes}

import scala.io.Source
import scala.language.reflectiveCalls
import scala.xml.XML
/**
 * Extracts page abstracts.
 *
 * From now on we use MobileFrontend for MW < 1.21 and TextExtracts for MW > 1.22.
 * The patched MW instance is no longer needed, except for minor customizations in LocalSettings.php.
 * TODO: we need to adapt the TextExtracts extension to accept custom wikicode syntax.
 * TextExtracts now uses the article entry and extracts the abstract. The rationale for
 * the new extension is that we no longer need to load all articles into MySQL, just the templates.
 * At the moment, setting up the patched MW takes longer than loading all articles into MySQL,
 * so even this way it's faster and cleaner.
 * We leave the old code commented out since we might reuse it soon.
 */
class AbstractExtractor(
  context : {
    def ontology : Ontology
    def language : Language
  }
)
extends PageNodeExtractor
{
  //TODO make this configurable
  protected def apiUrl: String = "http://localhost/mediawiki/api.php"

  private val maxRetries = 3

  /** timeout for connection to web server, milliseconds */
  private val connectMs = 2000

  /** timeout for result from web server, milliseconds */
  private val readMs = 8000

  /** sleep between retries, milliseconds, multiplied by CPU load */
  private val sleepFactorMs = 4000

  private val language = context.language.wikiCode

  private val logger = Logger.getLogger(classOf[AbstractExtractor].getName)

  //private val apiParametersFormat = "uselang="+language+"&format=xml&action=parse&prop=text&title=%s&text=%s"
  private val apiParametersFormat = "uselang="+language+"&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s"
  // lazy so testing does not need ontology
  private lazy val shortProperty = context.ontology.properties("rdfs:comment")

  // lazy so testing does not need ontology
  private lazy val longProperty = context.ontology.properties("abstract")

  private lazy val longQuad = QuadBuilder(context.language, DBpediaDatasets.LongAbstracts, longProperty, null) _
  private lazy val shortQuad = QuadBuilder(context.language, DBpediaDatasets.ShortAbstracts, shortProperty, null) _

  override val datasets = Set(DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts)

  private val osBean = java.lang.management.ManagementFactory.getOperatingSystemMXBean()

  private val availableProcessors = osBean.getAvailableProcessors()
  override def extract(pageNode : PageNode, subjectUri : String, pageContext : PageContext): Seq[Quad] =
  {
    // Only extract abstracts for pages from the Main namespace
    if(pageNode.title.namespace != Namespace.Main) return Seq.empty

    // Don't extract abstracts from redirect and disambiguation pages
    if(pageNode.isRedirect || pageNode.isDisambiguation) return Seq.empty

    // Reproduce wiki text for abstract
    //val abstractWikiText = getAbstractWikiText(pageNode)
    //if(abstractWikiText == "") return Seq.empty

    // Retrieve page text
    var text = retrievePage(pageNode.title /*, abstractWikiText*/)

    text = postProcess(pageNode.title, text)

    if (text.trim.isEmpty)
      return Seq.empty

    // Create a short version of the abstract
    val shortText = short(text)

    // Create statements
    val quadLong = longQuad(subjectUri, text, pageNode.sourceUri)
    val quadShort = shortQuad(subjectUri, shortText, pageNode.sourceUri)

    if (shortText.isEmpty)
    {
      Seq(quadLong)
    }
    else
    {
      Seq(quadLong, quadShort)
    }
  }
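  // Sketch of the expected output (subject URI assumed for illustration): for
  // "http://dbpedia.org/resource/Berlin" this returns a quad in the LongAbstracts dataset using
  // the ontology's "abstract" property, plus a quad in the ShortAbstracts dataset using
  // rdfs:comment whenever the shortened text is non-empty.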
  /**
   * Retrieves a Wikipedia page.
   *
   * @param pageTitle The encoded title of the page
   * @return The page text
   */
  def retrievePage(pageTitle : WikiTitle/*, pageWikiText : String*/) : String =
  {
    // The encoded title may contain some URI-escaped characters (e.g. "5%25-Klausel"),
    // so we can't use URLEncoder.encode(). But "&" is not escaped, so we do this here.
    // TODO: there may be other characters that need to be escaped.
    var titleParam = pageTitle.encodedWithNamespace
    AbstractExtractor.CHARACTERS_TO_ESCAPE foreach { case (search, replacement) =>
      titleParam = titleParam.replace(search, replacement)
    }

    // Fill parameters
    val parameters = apiParametersFormat.format(titleParam/*, URLEncoder.encode(pageWikiText, "UTF-8")*/)

    val url = new URL(apiUrl)
    for(counter <- 1 to maxRetries)
    {
      try
      {
        // Send data
        val conn = url.openConnection
        conn.setDoOutput(true)
        conn.setConnectTimeout(connectMs)
        conn.setReadTimeout(readMs)
        val writer = new OutputStreamWriter(conn.getOutputStream)
        writer.write(parameters)
        writer.flush()
        writer.close()

        // Read answer
        return readInAbstract(conn.getInputStream)
      }
      catch
      {
        case ex: Exception => {

          // The web server may still be trying to render the page. If we send new requests
          // at once, there will be more and more tasks running in the web server and the
          // system eventually becomes overloaded. So we wait a moment. The higher the load,
          // the longer we wait.
          var loadFactor = Double.NaN
          var sleepMs = sleepFactorMs

          // if the load average is not available, a negative value is returned
          val load = osBean.getSystemLoadAverage()
          if (load >= 0) {
            loadFactor = load / availableProcessors
            sleepMs = (loadFactor * sleepFactorMs).toInt
          }
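          // Worked example with assumed numbers: a load average of 8.0 on a 4-core machine
          // gives loadFactor = 2.0, so we back off for 2.0 * 4000 = 8000 ms.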
          if (counter < maxRetries) {
            logger.log(Level.INFO, "Error retrieving abstract of " + pageTitle + ". Retrying after " + sleepMs + " ms. Load factor: " + loadFactor, ex)
            Thread.sleep(sleepMs)
          }
          else {
            ex match {
              case e : java.net.SocketTimeoutException => logger.log(Level.INFO,
                "Timeout error retrieving abstract of " + pageTitle + " in " + counter + " tries. Giving up. Load factor: " + loadFactor, ex)
              case _ => logger.log(Level.INFO,
                "Error retrieving abstract of " + pageTitle + " in " + counter + " tries. Giving up. Load factor: " + loadFactor, ex)
            }
          }
        }
      }
    }

    throw new Exception("Could not retrieve abstract for page: " + pageTitle)
  }
  /**
   * Returns the leading sentences of the given text, up to a total length of `max` characters.
   * A sentence is assumed to end with a dot followed by whitespace.
   * If even the first sentence is longer than `max`, it is returned in full.
   * TODO: probably doesn't work for most non-European languages.
   * TODO: analyse ActiveAbstractExtractor; I think this works quite well there
   * because it takes the first two or three sentences.
   * @param text the text to shorten
   * @param max maximum length of the result
   * @return the shortened string
   */
  def short(text : String, max : Int = 500) : String =
  {
    if (text.size < max) return text

    val builder = new StringBuilder()
    var size = 0

    // split after every "dot followed by whitespace"
    for(sentence <- text.split("""(?<=\.\s)"""))
    {
      if(size + sentence.size > max)
      {
        // the very first sentence is already longer than max - return it anyway
        if (builder.isEmpty)
        {
          return sentence
        }
        return builder.toString().trim
      }
      size += sentence.size
      builder.append(sentence)
    }

    builder.toString().trim
  }
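  // Illustrative behaviour (input assumed): short("First sentence. Second sentence.", max = 20)
  // appends "First sentence. " (16 chars), stops because adding the second sentence would
  // exceed 20 characters, and returns "First sentence.".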
  /**
   * Get the parsed and cleaned abstract text from the MediaWiki instance input stream.
   * The response has the form
   * <api> <query> <pages> <page> <extract> ABSTRACT_TEXT </extract> </page> </pages> </query> </api>
   * /// <api> <parse> <text> ABSTRACT_TEXT </text> </parse> </api>
   */
  private def readInAbstract(inputStream : InputStream) : String =
  {
    // for XML format
    val xmlAnswer = Source.fromInputStream(inputStream, "UTF-8").getLines().mkString("")
    //val text = (XML.loadString(xmlAnswer) \ "parse" \ "text").text.trim
    var text = (XML.loadString(xmlAnswer) \ "query" \ "pages" \ "page" \ "extract").text.trim

    text = decodeHtml(text)

    // strip leftover markup (see AbstractExtractor.patternsToRemove)
    for ((regex, replacement) <- AbstractExtractor.patternsToRemove) {
      val matches = regex.pattern.matcher(text)
      if (matches.find()) {
        text = matches.replaceAll(replacement)
      }
    }

    text
  }
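  // Illustrative response (page attributes assumed), matching the selection above:
  //   <api><query><pages><page pageid="3354" title="Berlin">
  //     <extract xml:space="preserve">Berlin is the capital of Germany. ...</extract>
  //   </page></pages></query></api>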
  private def postProcess(pageTitle: WikiTitle, text: String): String =
  {
    val startsWithLowercase =
      if (text.isEmpty) {
        false
      } else {
        val firstLetter = text.substring(0,1)
        firstLetter != firstLetter.toUpperCase(context.language.locale)
      }

    //HACK
    if (startsWithLowercase)
    {
      val decodedTitle = pageTitle.decoded.replaceFirst(" \\(.+\\)$", "")

      if (! text.toLowerCase.contains(decodedTitle.toLowerCase))
      {
        // happens mainly for Japanese names (abstract starts with template)
        return decodedTitle + " " + text
      }
    }

    text
  }
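  // Illustrative case (title and text assumed): for a page titled "Hoge (band)" the parenthesized
  // suffix is stripped, leaving "Hoge"; if the abstract starts with a lowercase letter and never
  // mentions "hoge", the result becomes "Hoge " + text.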
  //private val destinationNamespacesToRender = List(Namespace.Main, Namespace.Template)

  /*
  private def renderNode(node : Node) = node match
  {
    case InternalLinkNode(destination, _, _, _) => destinationNamespacesToRender contains destination.namespace
    case ParserFunctionNode(_, _, _) => false
    case _ => true
  }
  */

  /**
   * Get the wiki text that contains the abstract text.
   */
  /*
  def getAbstractWikiText(pageNode : PageNode) : String =
  {
    // From first TextNode
    val start = pageNode.children.indexWhere{
      case TextNode(text, _) => text.trim != ""
      case InternalLinkNode(destination, _, _, _) => destination.namespace == Namespace.Main
      case _ => false
    }

    // To first SectionNode (exclusive)
    var end = pageNode.children.indexWhere{
      case sectionNode : SectionNode => true
      case _ => false
    }

    // If there is no SectionNode, go to the last non-empty TextNode (inclusive)
    if(end == -1)
    {
      val reverseLastTextIndex = pageNode.children.reverse.indexWhere{
        case TextNode(text, _) => text.trim != ""
        case _ => false
      }
      if(reverseLastTextIndex != -1)
      {
        end = pageNode.children.length - reverseLastTextIndex
      }
    }

    // No result if there is no TextNode or no text before a SectionNode
    if(start == -1 || end == -1 || start >= end)
    {
      return ""
    }

    // Re-generate wiki text for the found range of nodes
    val text = pageNode.children.slice(start, end)
               .filter(renderNode)
               .map(_.toWikiText)
               .mkString("").trim

    // decode HTML entities - the result is plain text
    decodeHtml(text)
  }
  */
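  // decodeHtml unescapes HTML entities without re-encoding anything, e.g. (input assumed)
  // "Tom &amp; Jerry" becomes "Tom & Jerry".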
  def decodeHtml(text: String): String = {
    val coder = new HtmlCoder(XmlCodes.NONE)
    coder.setErrorHandler(ParseExceptionIgnorer.INSTANCE)
    coder.code(text)
  }
}
object AbstractExtractor {

  /**
   * List of all characters which are reserved in a query component according to RFC 2396,
   * with their escape sequences as determined by the JavaScript function encodeURIComponent.
   */
  val CHARACTERS_TO_ESCAPE = List(
    (";", "%3B"),
    ("/", "%2F"),
    ("?", "%3F"),
    (":", "%3A"),
    ("@", "%40"),
    ("&", "%26"),
    ("=", "%3D"),
    ("+", "%2B"),
    (",", "%2C"),
    ("$", "%24")
  )
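  // For example (title assumed), escaping the title "AT&T" with CHARACTERS_TO_ESCAPE yields
  // "AT%26T", so the ampersand cannot be mistaken for a parameter separator in the query string.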
  val patternsToRemove = List(
    """<div style=[^/]*/>""".r -> " ",
    """</div>""".r -> " "
  )
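  // Illustrative match (style value assumed): in """Intro.<div style="width:20em" /> More.""",
  // patternsToRemove replaces the self-closing styled <div> with a space; stray "</div>" tags
  // are likewise replaced with spaces.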
}