PageRenderTime 47ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/NBoilerpipePortable/Filters/Heuristics/KeepLargestBlockFilter.cs

https://github.com/hippiehunter/Baconography
C# | 129 lines | 103 code | 9 blank | 17 comment | 17 complexity | 53bdcd37281111678c07ae77d67928aa MD5 | raw file
  1. /*
  2. * This code is derived from boilerpipe
  3. *
  4. */
  5. using System.Collections.Generic;
  6. using NBoilerpipePortable;
  7. using NBoilerpipePortable.Document;
  8. using NBoilerpipePortable.Labels;
  9. using System.Linq;
  namespace NBoilerpipePortable.Filters.Heuristics
  {
      /// <summary>
      /// Reduces a document to its single largest
      /// <see cref="NBoilerpipePortable.Document.TextBlock">TextBlock</see>,
      /// where size is measured in words. Only blocks currently marked as content
      /// compete; when several share the highest word count, the earliest one wins.
      /// Every other block is marked "not content" and receives the
      /// <see cref="NBoilerpipePortable.Labels.DefaultLabels.MIGHT_BE_CONTENT">MIGHT_BE_CONTENT</see>
      /// label.
      /// When the filter is created with <c>expandToSameLevelText</c> set, blocks around
      /// the winner that report the same tag level and contain at least
      /// <c>minWords</c> words are marked as content again.
      /// </summary>
      /// <author>Christian Kohlschütter</author>
      public sealed class KeepLargestBlockFilter : BoilerpipeFilter
      {
          /// <summary>Keeps the largest block only; no expansion.</summary>
          public static readonly KeepLargestBlockFilter INSTANCE =
              new KeepLargestBlockFilter(false, 0);

          /// <summary>Keeps the largest block and expands to same-tag-level blocks of any size.</summary>
          public static readonly KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL =
              new KeepLargestBlockFilter(true, 0);

          /// <summary>Keeps the largest block and expands to same-tag-level blocks of at least 150 words.</summary>
          public static readonly KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS =
              new KeepLargestBlockFilter(true, 150);

          private readonly bool expandToSameLevelText;
          private readonly int minWords;

          /// <summary>Creates a filter with the given expansion behaviour.</summary>
          /// <param name="expandToSameLevelText">
          /// Whether blocks at the same tag level as the largest block are re-marked as content.
          /// </param>
          /// <param name="minWords">
          /// Minimum word count a same-level block needs in order to be re-marked as content.
          /// </param>
          public KeepLargestBlockFilter(bool expandToSameLevelText, int minWords)
          {
              this.expandToSameLevelText = expandToSameLevelText;
              this.minWords = minWords;
          }

          /// <summary>
          /// Applies the filter to <paramref name="doc"/>, updating the content flag and
          /// labels of its text blocks in place.
          /// </summary>
          /// <param name="doc">The document whose text blocks are examined.</param>
          /// <returns>
          /// <c>false</c> when the document holds fewer than two text blocks (nothing is
          /// touched in that case); <c>true</c> otherwise.
          /// </returns>
          /// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException"></exception>
          public bool Process(TextDocument doc)
          {
              IList<TextBlock> blocks = doc.GetTextBlocks();
              if (blocks.Count < 2)
              {
                  return false;
              }

              // Pass 1: locate the first content block with the highest word count.
              TextBlock winner = null;
              int winnerIndex = -1;
              int winnerWords = -1;
              int winnerLevel = -1;
              for (int idx = 0; idx < blocks.Count; idx++)
              {
                  TextBlock candidate = blocks[idx];
                  if (!candidate.IsContent())
                  {
                      continue;
                  }

                  int words = candidate.GetNumWords();
                  if (words <= winnerWords)
                  {
                      // Strictly-greater comparison keeps the earliest block on ties.
                      continue;
                  }

                  winner = candidate;
                  winnerWords = words;
                  winnerIndex = idx;
                  if (expandToSameLevelText)
                  {
                      winnerLevel = candidate.GetTagLevel();
                  }
              }

              // Pass 2: only the winner stays content; all others are demoted and labelled.
              for (int idx = 0; idx < blocks.Count; idx++)
              {
                  TextBlock block = blocks[idx];
                  bool isWinner = block == winner;
                  block.SetIsContent(isWinner);
                  if (!isWinner)
                  {
                      block.AddLabel(DefaultLabels.MIGHT_BE_CONTENT);
                  }
              }

              // No expansion requested, or no content block was found in pass 1.
              if (!expandToSameLevelText || winnerIndex == -1)
              {
                  return true;
              }

              // Pass 3a: walk backwards from the block just before the winner.
              for (int before = winnerIndex - 1; before >= 0; before--)
              {
                  if (!RestoreIfSameLevel(blocks[before], winnerLevel))
                  {
                      break;
                  }
              }

              // Pass 3b: walk forwards, starting at the winner itself.
              for (int after = winnerIndex; after < blocks.Count; after++)
              {
                  if (!RestoreIfSameLevel(blocks[after], winnerLevel))
                  {
                      break;
                  }
              }

              return true;
          }

          /// <summary>
          /// Re-marks <paramref name="block"/> as content when its tag level equals
          /// <paramref name="anchorLevel"/> and it has at least <c>minWords</c> words.
          /// </summary>
          /// <returns>
          /// <c>false</c> if the block's tag level is lower than <paramref name="anchorLevel"/>,
          /// signalling the caller to stop scanning in this direction; <c>true</c> otherwise.
          /// </returns>
          private bool RestoreIfSameLevel(TextBlock block, int anchorLevel)
          {
              int blockLevel = block.GetTagLevel();
              if (blockLevel < anchorLevel)
              {
                  return false;
              }

              if (blockLevel == anchorLevel && block.GetNumWords() >= minWords)
              {
                  block.SetIsContent(true);
              }

              return true;
          }
      }
  }