/code/features-convert/pubdate/src/main/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java

https://github.com/MarginaliaSearch/MarginaliaSearch · Java · 148 lines · 107 code · 32 blank · 9 comment · 27 complexity · 5943e200bb84b9e1d2d26047b29ec055 MD5 · raw file

  1. package nu.marginalia.pubdate.heuristic;
  2. import nu.marginalia.converting.model.HtmlStandard;
  3. import nu.marginalia.model.crawl.PubDate;
  4. import nu.marginalia.pubdate.PubDateHeuristic;
  5. import nu.marginalia.pubdate.PubDateParser;
  6. import nu.marginalia.model.EdgeUrl;
  7. import nu.marginalia.pubdate.PubDateEffortLevel;
  8. import org.jetbrains.annotations.NotNull;
  9. import org.jsoup.nodes.Document;
  10. import org.jsoup.nodes.Element;
  11. import org.jsoup.nodes.Node;
  12. import org.jsoup.nodes.TextNode;
  13. import org.jsoup.select.NodeFilter;
  14. import java.util.Optional;
  15. public class PubDateHeuristicDOMParsingPass1 implements PubDateHeuristic {
  16. @Override
  17. public Optional<PubDate> apply(PubDateEffortLevel effortLevel, String headers, EdgeUrl url, Document document, HtmlStandard htmlStandard) {
  18. if (effortLevel == PubDateEffortLevel.LOW)
  19. return Optional.empty();
  20. DateExtractingNodeVisitorPass filter = new DateExtractingNodeVisitorPass(htmlStandard);
  21. document.filter(filter);
  22. return Optional.ofNullable(filter.pubDate);
  23. }
  24. private static class DateExtractingNodeVisitorPass implements NodeFilter {
  25. public PubDate pubDate;
  26. private final HtmlStandard htmlStandard;
  27. private DateExtractingNodeVisitorPass(HtmlStandard htmlStandard) {
  28. this.htmlStandard = htmlStandard;
  29. }
  30. @NotNull
  31. @Override
  32. public FilterResult head(@NotNull Node node, int depth) {
  33. if (node instanceof TextNode tn) onTextNode(tn);
  34. if (node instanceof Element el) onElementNode(el);
  35. if (hasPubDate()) {
  36. return FilterResult.STOP;
  37. }
  38. return FilterResult.CONTINUE;
  39. }
  40. public void onTextNode(TextNode tn) {
  41. String text = tn.getWholeText();
  42. if (text.length() < 32 && isCandidatForCopyrightNotice(text)) {
  43. parse(text);
  44. }
  45. }
  46. public void onElementNode(Element el) {
  47. if (hasCommonClass(el)) {
  48. parse(el.text());
  49. }
  50. if (!hasPubDate())
  51. tryParsePhpBBDate(el);
  52. }
  53. public boolean isCandidatForCopyrightNotice(String text) {
  54. if (text.contains("ublished"))
  55. return true;
  56. if (text.contains("opyright"))
  57. return true;
  58. if (text.contains("&copy;"))
  59. return true;
  60. if (text.contains("(c)"))
  61. return true;
  62. return false;
  63. }
  64. public boolean hasCommonClass(Element el) {
  65. var classes = el.classNames();
  66. return classes.contains("entry-meta") // wordpress
  67. || classes.contains("byline")
  68. || classes.contains("author")
  69. || classes.contains("submitted")
  70. || el.id().contains("footer-info-lastmod"); // mediawiki
  71. }
  72. public void tryParsePhpBBDate(Element el) {
  73. /* Match HTML on the form <div>[...] <b>Posted:</b> Sun Oct 03, 2010 5:37 pm&nbsp;</div>
  74. * this is used on old phpBB message boards
  75. *
  76. * Schematically the DOM looks like this
  77. *
  78. * b - TextNode[ Sun Oct 03, 2010 5:37 pm&nbsp;]
  79. * |
  80. * TextNode[Posted:]
  81. */
  82. if ("b".equals(el.tagName())
  83. && el.childNodeSize() == 1
  84. && el.childNode(0) instanceof TextNode ctn
  85. && "Posted:".equals(ctn.getWholeText())
  86. && el.nextSibling() instanceof TextNode ntn
  87. )
  88. {
  89. parse(ntn.getWholeText());
  90. }
  91. }
  92. public boolean hasPubDate() {
  93. return pubDate != null;
  94. }
  95. public void setPubDate(PubDate pubDate) {
  96. this.pubDate = pubDate;
  97. }
  98. @NotNull
  99. @Override
  100. public FilterResult tail(@NotNull Node node, int depth) {
  101. return FilterResult.CONTINUE;
  102. }
  103. private void parse(String text) {
  104. if (htmlStandard == HtmlStandard.UNKNOWN) {
  105. PubDateParser
  106. .dateFromHighestYearLookingSubstring(text)
  107. .ifPresent(this::setPubDate);
  108. }
  109. else {
  110. PubDateParser
  111. .dateFromHighestYearLookingSubstringWithGuess(text, htmlStandard.yearGuess)
  112. .ifPresent(this::setPubDate);
  113. }
  114. }
  115. }
  116. }