/lexnlp/extract/common/definitions/common_definition_patterns.py

https://github.com/LexPredict/lexpredict-lexnlp · Python · 189 lines · 183 code · 4 blank · 2 comment · 0 complexity · 3ef4adc8052fc15aa7a4ab49b2482b58 MD5 · raw file

  1. from typing import List, Match, Callable
  2. import regex as re
  3. from lexnlp.extract.common.definitions.universal_definition_parser import UniversalDefinitionsParser
  4. from lexnlp.extract.common.pattern_found import PatternFound
  5. __author__ = "ContraxSuite, LLC; LexPredict, LLC"
  6. __copyright__ = "Copyright 2015-2020, ContraxSuite, LLC"
  7. __license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/1.7.0/LICENSE"
  8. __version__ = "1.7.0"
  9. __maintainer__ = "LexPredict, LLC"
  10. __email__ = "support@contraxsuite.com"
  11. class CommonDefinitionPatterns:
  12. reg_semicolon = re.compile("([\"'“„])(?:(?=(\\\\?))\\2.)*?\\1(?=:)", re.UNICODE | re.IGNORECASE)
  13. reg_quoted = re.compile("([\"'“„])(?:(?=(\\\\?))\\2.)*?\\1", re.UNICODE | re.IGNORECASE)
  14. reg_acronyms = re.compile(r"\(\p{Lu}\p{L}*\p{Lu}\)", re.UNICODE)
  15. @staticmethod
  16. def match_acronyms(phrase: str) -> List[PatternFound]:
  17. """
  18. :param phrase: rompió el silencio tras ser despedido del Canal del Fútbol (CDF).
  19. :return: {name: 'CDF', probability: 100, ...}
  20. """
  21. defs = []
  22. for match in CommonDefinitionPatterns.reg_acronyms.finditer(phrase):
  23. acr_start = CommonDefinitionPatterns.get_acronym_words_start(phrase, match)
  24. if acr_start < 0:
  25. continue
  26. df = PatternFound()
  27. df.name = match.group().strip('() ')
  28. df.start = acr_start
  29. df.end = match.start() - 1
  30. df.probability = 100
  31. defs.append(df)
  32. return defs
  33. @staticmethod
  34. def get_acronym_words_start(phrase: str, match: Match) -> int:
  35. """
  36. each acronym match should be preceded by capitalized words that start from the same letters
  37. :param phrase: "rompió el silencio tras ser despedido del Canal del Fútbol (CDF). "
  38. :param match: "(CDF)" Match object for this example
  39. :return: start letter (42 for this case) index or -1
  40. """
  41. proc = UniversalDefinitionsParser.basic_line_processor
  42. name = match.group().strip('() ').upper()
  43. start = match.start()
  44. words = proc.split_text_on_words(phrase[:start])
  45. if len(words) < 2:
  46. return -1
  47. mistakes = 0
  48. uppercases = 0
  49. acr_index = len(name) - 1
  50. acr_start = words[-1].start
  51. for i in range(len(words) - 1, -1, -1):
  52. if words[i].is_separator:
  53. continue
  54. l = words[i].text[0]
  55. l_upper = l.upper()
  56. is_upper = l_upper == l
  57. if is_upper:
  58. uppercases += 1
  59. is_correct = name[acr_index] == l_upper
  60. if not is_correct:
  61. mistakes += 1
  62. if mistakes > 1:
  63. return -1
  64. continue
  65. acr_start = words[i].start
  66. acr_index -= 1
  67. if acr_index < 0:
  68. break
  69. return acr_start if uppercases > 1 and acr_index < 0 else -1
  70. @staticmethod
  71. def match_es_def_by_semicolon(phrase: str) -> List[PatternFound]:
  72. """
  73. :param phrase: "Modern anatomy human": a human of modern anatomy.
  74. :return: {name: 'Modern anatomy human', probability: 100, ...}
  75. """
  76. prob = 100
  77. defs = []
  78. for match in CommonDefinitionPatterns.reg_semicolon.finditer(phrase):
  79. df = PatternFound()
  80. df.name = match.group()
  81. df.start = 0
  82. df.end = len(phrase)
  83. df.probability = prob
  84. defs.append(df)
  85. prob = 66
  86. return defs
  87. @staticmethod
  88. def peek_quoted_part(phrase: str,
  89. match: Match,
  90. start_func: Callable[[str, Match, Match], int],
  91. end_func: Callable[[str, Match, Match], int],
  92. match_prob: int) -> List[PatternFound]:
  93. """
  94. :param phrase: the whole text, may be used for getting the definition's text length
  95. :param match: the matched part of the phrase that may contain several quote-packed definitions
  96. :param start_func: (phrase, match, quoted_match) -> definition's start
  97. :param end_func: (phrase, match, quoted_match) -> definition's end
  98. :param match_prob: definition's probability
  99. :return: a list of definitions found or an empty list
  100. """
  101. defs = []
  102. text = match.group()
  103. quoted_entries = [m for m in CommonDefinitionPatterns.reg_quoted.finditer(text)]
  104. if len(quoted_entries) == 0:
  105. return defs
  106. for entry in quoted_entries:
  107. df = PatternFound()
  108. df.name = entry.group()
  109. df.start = start_func(phrase, match, entry)
  110. df.end = end_func(phrase, match, entry)
  111. df.probability = match_prob
  112. defs.append(df)
  113. return defs
  114. @staticmethod
  115. def collect_regex_matches_with_quoted_chunks(phrase: str, reg: re, prob: int,
  116. quoted_def_start: Callable[[str, Match, Match], int],
  117. quoted_def_end: Callable[[str, Match, Match], int],
  118. def_start: Callable[[str, Match], int],
  119. def_end: Callable[[str, Match], int]
  120. ) -> List[PatternFound]:
  121. """
  122. First, find all matches by 'reg' ptr
  123. Second, go through matches
  124. For each match try to find a set of quoted words
  125. If found, use them as matches
  126. Or use the whole match
  127. :param quoted_def_start: (phrase, match, quoted_match) -> definition's start
  128. :param quoted_def_end: (phrase, match, quoted_match) -> definition's end
  129. :param def_start: (phrase, match) -> definition's start
  130. :param def_end: (phrase, match) -> definition's end
  131. :return:
  132. """
  133. defs = []
  134. for match in reg.finditer(phrase):
  135. quoted_matches = \
  136. CommonDefinitionPatterns.peek_quoted_part(phrase,
  137. match,
  138. quoted_def_start,
  139. quoted_def_end,
  140. prob)
  141. if len(quoted_matches) > 0:
  142. defs += quoted_matches
  143. continue
  144. df = PatternFound()
  145. df.name = match.group()
  146. df.start = def_start(phrase, match)
  147. df.end = def_end(phrase, match)
  148. df.probability = prob
  149. defs.append(df)
  150. return defs
  151. @staticmethod
  152. def collect_regex_matches(phrase: str, reg: re, prob: int,
  153. def_start: Callable[[str, Match], int],
  154. def_end: Callable[[str, Match], int]
  155. ) -> List[PatternFound]:
  156. """
  157. find all matches by 'reg' ptr
  158. :param quoted_def_start: (phrase, match, quoted_match) -> definition's start
  159. :param quoted_def_end: (phrase, match, quoted_match) -> definition's end
  160. :param def_start: (phrase, match) -> definition's start
  161. :param def_end: (phrase, match) -> definition's end
  162. :return:
  163. """
  164. defs = []
  165. for match in reg.finditer(phrase):
  166. df = PatternFound()
  167. df.name = match.group()
  168. df.start = def_start(phrase, match)
  169. df.end = def_end(phrase, match)
  170. df.probability = prob
  171. defs.append(df)
  172. return defs