PageRenderTime 48ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/NBoilerpipePortable/Filters/Heuristics/ExpandTitleToContentFilter.cs

https://github.com/hippiehunter/Baconography
C# | 72 lines | 49 code | 5 blank | 18 comment | 10 complexity | cee5d4ac573e3e7975a5d7ea83caf084 MD5 | raw file
  1. /*
  2. * This code is derived from boilerpipe
  3. *
  4. */
  5. using NBoilerpipePortable;
  6. using NBoilerpipePortable.Document;
  7. using NBoilerpipePortable.Filters.Heuristics;
  8. using NBoilerpipePortable.Labels;
  9. using Sharpen;
  10. namespace NBoilerpipePortable.Filters.Heuristics
  11. {
  /// <summary>
  /// Promotes to "content" every
  /// <see cref="NBoilerpipePortable.Document.TextBlock">TextBlock</see>
  /// that sits between the headline block and the first block already
  /// flagged as content, provided the block carries the
  /// <see cref="NBoilerpipePortable.Labels.DefaultLabels.MIGHT_BE_CONTENT">MIGHT_BE_CONTENT</see>
  /// label.
  /// This filter is quite specific to the news domain.
  /// </summary>
  /// <author>Christian Kohlschütter</author>
  public sealed class ExpandTitleToContentFilter : BoilerpipeFilter
  {
    /// <summary>The single shared instance of this filter.</summary>
    public static readonly ExpandTitleToContentFilter INSTANCE = new ExpandTitleToContentFilter();

    /// <summary>Gives access to the shared ExpandTitleToContentFilter instance.</summary>
    /// <returns>The value of <see cref="INSTANCE"/>.</returns>
    public static ExpandTitleToContentFilter GetInstance()
    {
      return INSTANCE;
    }

    /// <summary>
    /// Looks for the last TITLE-labelled block that precedes the first content
    /// block, then marks the MIGHT_BE_CONTENT blocks in the range starting at
    /// that title block as content.
    /// </summary>
    /// <param name="doc">The document whose text blocks are inspected and updated.</param>
    /// <returns>
    /// <c>false</c> when no title was found or the first content block does not
    /// come after the title; otherwise <c>true</c> if at least one
    /// <c>SetIsContent(true)</c> call returned <c>true</c>.
    /// </returns>
    /// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException">
    /// Listed for API documentation purposes; this method contains no explicit throw.
    /// </exception>
    public bool Process(TextDocument doc)
    {
      int titleIndex = -1;
      int firstContentIndex = -1;
      int position = 0;

      foreach (TextBlock block in doc.GetTextBlocks())
      {
        // Once the first content block is known, later blocks are only counted.
        if (firstContentIndex == -1)
        {
          // A later title (still before any content) overrides an earlier one.
          if (block.HasLabel(DefaultLabels.TITLE))
          {
            titleIndex = position;
          }

          // The title block itself may be the first content block; that case
          // is rejected by the guard below because the indices are equal.
          if (block.IsContent())
          {
            firstContentIndex = position;
          }
        }

        position++;
      }

      // Nothing to expand without a title, or when content does not follow it.
      if (titleIndex == -1 || firstContentIndex <= titleIndex)
      {
        return false;
      }

      bool anyChanged = false;

      // NOTE(review): SubList is a Sharpen helper; confirm that its second
      // argument is an exclusive end index (Java subList semantics) and not a length.
      foreach (TextBlock candidate in doc.GetTextBlocks().SubList(titleIndex, firstContentIndex))
      {
        if (!candidate.HasLabel(DefaultLabels.MIGHT_BE_CONTENT))
        {
          continue;
        }

        // SetIsContent is always invoked for a matching block, whatever the
        // current value of the flag.
        if (candidate.SetIsContent(true))
        {
          anyChanged = true;
        }
      }

      return anyChanged;
    }
  }
  67. }