/coala_utils/string_processing/Core.py

https://gitlab.com/SanketDG/coala-utils · Python · 513 lines · 338 code · 25 blank · 150 comment · 27 complexity · 7910dc4188f62da601ae6bcc7a3f614a MD5 · raw file

  1. import re
  2. from coala_utils.string_processing import InBetweenMatch
  3. from coala_utils.string_processing.Filters import (limit,
  4. trim_empty_matches)
  5. def search_for(pattern, string, flags=0, max_match=0, use_regex=False):
  6. """
  7. Searches for a given pattern in a string.
  8. :param pattern: A pattern that defines what to match.
  9. :param string: The string to search in.
  10. :param flags: Additional flags to pass to the regex processor.
  11. :param max_match: Defines the maximum number of matches to perform. If 0 or
  12. less is provided, the number of splits is not limited.
  13. :param use_regex: Specifies whether to treat the pattern as a regex or
  14. simple string.
  15. :return: An iterator returning MatchObject's.
  16. """
  17. if not use_regex:
  18. pattern = re.escape(pattern)
  19. return limit(re.finditer(pattern, string, flags), max_match)
  20. def unescaped_search_for(pattern,
  21. string,
  22. flags=0,
  23. max_match=0,
  24. use_regex=False):
  25. """
  26. Searches for a given pattern in a string that is not escaped.
  27. :param pattern: A pattern that defines what to match unescaped.
  28. :param string: The string to search in.
  29. :param flags: Additional flags to pass to the regex processor.
  30. :param max_match: Defines the maximum number of matches to perform. If 0 or
  31. less is provided, the number of splits is not limited.
  32. :param use_regex: Specifies whether to treat the pattern as a regex or
  33. simple string.
  34. :return: An iterator returning MatchObject's.
  35. """
  36. _iter = limit(
  37. filter(lambda match: not position_is_escaped(string, match.start()),
  38. search_for(pattern, string, flags, 0, use_regex)),
  39. max_match)
  40. for elem in _iter:
  41. yield elem
  42. def _split(string,
  43. max_split,
  44. remove_empty_matches,
  45. matching_function,
  46. *args,
  47. **kwargs):
  48. """
  49. Splits a string using a given matching-function that matches the separator.
  50. This function implements general features needed from the split functions
  51. in this module (the max-split and remove-empty-matches features).
  52. :param string: The string where to split.
  53. :param max_split: Defines the maximum number of splits. If 0 or
  54. less is provided, the number of splits is not
  55. limited.
  56. :param remove_empty_matches: Defines whether empty entries should
  57. be removed from the result.
  58. :param matching_function: The matching function. It must return
  59. MatchObject's containing the matched
  60. split-separator.
  61. :param args: Positional arguments to invoke the
  62. matching_function with.
  63. :param kwargs: Key-value arguments to invoke the
  64. matching_function with.
  65. """
  66. last_end_pos = 0
  67. for match in matching_function(*args, **kwargs):
  68. split_string = string[last_end_pos: match.start()]
  69. last_end_pos = match.end()
  70. if not remove_empty_matches or len(split_string) != 0:
  71. yield split_string
  72. max_split -= 1
  73. if max_split == 0:
  74. break # only reachable when max_split > 0
  75. # Append the rest of the string.
  76. if not remove_empty_matches or len(string) > last_end_pos:
  77. yield string[last_end_pos:]
  78. def split(pattern,
  79. string,
  80. max_split=0,
  81. remove_empty_matches=False,
  82. use_regex=False):
  83. """
  84. Splits the given string by the specified pattern. The return character (\\n)
  85. is not a natural split pattern (if you don't specify it yourself).
  86. This function ignores escape sequences.
  87. :param pattern: A pattern that defines where to split.
  88. :param string: The string to split by the defined pattern.
  89. :param max_split: Defines the maximum number of splits. If 0 or
  90. less is provided, the number of splits is not
  91. limited.
  92. :param remove_empty_matches: Defines whether empty entries should
  93. be removed from the result.
  94. :param use_regex: Specifies whether to treat the split pattern
  95. as a regex or simple string.
  96. :return: An iterator returning the split up strings.
  97. """
  98. return _split(string,
  99. max_split,
  100. remove_empty_matches,
  101. search_for,
  102. pattern,
  103. string,
  104. 0,
  105. 0,
  106. use_regex)
  107. def unescaped_split(pattern,
  108. string,
  109. max_split=0,
  110. remove_empty_matches=False,
  111. use_regex=False):
  112. """
  113. Splits the given string by the specified pattern. The return character (\\n)
  114. is not a natural split pattern (if you don't specify it yourself).
  115. This function handles escaped split-patterns (and so splits only patterns
  116. that are unescaped).
  117. :param pattern: A pattern that defines where to split.
  118. :param string: The string to split by the defined pattern.
  119. :param max_split: Defines the maximum number of splits. If 0 or
  120. less is provided, the number of splits is not
  121. limited.
  122. :param remove_empty_matches: Defines whether empty entries should
  123. be removed from the result.
  124. :param use_regex: Specifies whether to treat the split pattern
  125. as a regex or simple string.
  126. :return: An iterator returning the split up strings.
  127. """
  128. return _split(string,
  129. max_split,
  130. remove_empty_matches,
  131. unescaped_search_for,
  132. pattern,
  133. string,
  134. 0,
  135. 0,
  136. use_regex)
  137. def search_in_between(begin,
  138. end,
  139. string,
  140. max_matches=0,
  141. remove_empty_matches=False,
  142. use_regex=False):
  143. """
  144. Searches for a string enclosed between a specified begin- and end-sequence.
  145. Also enclosed \\n are put into the result. Doesn't handle escape sequences.
  146. :param begin: A pattern that defines where to start
  147. matching.
  148. :param end: A pattern that defines where to end matching.
  149. :param string: The string where to search in.
  150. :param max_matches: Defines the maximum number of matches. If 0 or
  151. less is provided, the number of matches is not
  152. limited.
  153. :param remove_empty_matches: Defines whether empty entries should
  154. be removed from the result. An entry is
  155. considered empty if no inner match was
  156. performed (regardless of matched start and
  157. end patterns).
  158. :param use_regex: Specifies whether to treat the begin and end
  159. patterns as regexes or simple strings.
  160. :return: An iterator returning InBetweenMatch objects
  161. that hold information about the matched begin,
  162. inside and end string matched.
  163. """
  164. if not use_regex:
  165. begin = re.escape(begin)
  166. end = re.escape(end)
  167. # No need to compile the begin sequence, capturing groups get escaped.
  168. begin_pattern_groups = 0
  169. else:
  170. # Compilation of the begin sequence is needed to get the number of
  171. # capturing groups in it.
  172. begin_pattern_groups = re.compile(begin).groups
  173. # Regex explanation:
  174. # 1. (begin) A capturing group that matches the begin sequence.
  175. # 2. (.*?) Match any char unlimited times, as few times as possible. Save
  176. # the match in the second capturing group (`match.group(2)`).
  177. # 3. (end) A capturing group that matches the end sequence.
  178. # Because the previous group is lazy (matches as few times as
  179. # possible) the next occurring end-sequence is matched.
  180. regex = "(" + begin + ")(.*?)(" + end + ")"
  181. matches = re.finditer(regex, string, re.DOTALL)
  182. if remove_empty_matches:
  183. matches = trim_empty_matches(matches,
  184. (begin_pattern_groups + 2,))
  185. matches = limit(matches, max_matches)
  186. for m in matches:
  187. yield InBetweenMatch.from_values(m.group(1),
  188. m.start(1),
  189. m.group(begin_pattern_groups + 2),
  190. m.start(begin_pattern_groups + 2),
  191. m.group(begin_pattern_groups + 3),
  192. m.start(begin_pattern_groups + 3))
  193. def unescaped_search_in_between(begin,
  194. end,
  195. string,
  196. max_matches=0,
  197. remove_empty_matches=False,
  198. use_regex=False):
  199. """
  200. Searches for a string enclosed between a specified begin- and end-sequence.
  201. Also enclosed \\n are put into the result.
  202. Handles escaped begin- and end-sequences (and so only patterns that are
  203. unescaped).
  204. .. warning::
  205. Using the escape character '\\' in the begin- or end-sequences
  206. the function can return strange results. The backslash can
  207. interfere with the escaping regex-sequence used internally to
  208. match the enclosed string.
  209. :param begin: A regex pattern that defines where to start
  210. matching.
  211. :param end: A regex pattern that defines where to end
  212. matching.
  213. :param string: The string where to search in.
  214. :param max_matches: Defines the maximum number of matches. If 0 or
  215. less is provided, the number of matches is not
  216. limited.
  217. :param remove_empty_matches: Defines whether empty entries should
  218. be removed from the result. An entry is
  219. considered empty if no inner match was
  220. performed (regardless of matched start and
  221. end patterns).
  222. :param use_regex: Specifies whether to treat the begin and end
  223. patterns as regexes or simple strings.
  224. :return: An iterator returning the matched strings.
  225. """
  226. if not use_regex:
  227. begin = re.escape(begin)
  228. end = re.escape(end)
  229. # No need to compile the begin sequence, capturing groups get escaped.
  230. begin_pattern_groups = 0
  231. else:
  232. # Compilation of the begin sequence is needed to get the number of
  233. # capturing groups in it.
  234. begin_pattern_groups = re.compile(begin).groups
  235. # Regex explanation:
  236. # 1. (?<!\\)(?:\\\\)* Unescapes the following char. The first part of
  237. # this regex is a look-behind assertion. Only match
  238. # the following if no single backslash is before it.
  239. # The second part matches all double backslashes.
  240. # In fact this sequence matches all escapes that
  241. # occur as a multiple of two, means the following
  242. # statement is not escaped.
  243. # 2. (begin) A capturing group that matches the begin sequence.
  244. # 3. (.*?) Match any char unlimited times, as few times as
  245. # possible. Save the match in the capturing group
  246. # after all capturing groups that can appear in
  247. # 'begin'.
  248. # 4. (?<!\\)((?:\\\\)*) Again the unescaping regex, but now all escape-
  249. # characters get captured.
  250. # 5. (end) A capturing group that matches the end sequence.
  251. # Because the 3. group is lazy (matches as few times
  252. # as possible) the next occurring end-sequence is
  253. # matched.
  254. regex = (r"(?<!\\)(?:\\\\)*(" + begin + r")(.*?)(?<!\\)((?:\\\\)*)(" +
  255. end + ")")
  256. matches = re.finditer(regex, string, re.DOTALL)
  257. if remove_empty_matches:
  258. matches = trim_empty_matches(matches,
  259. (begin_pattern_groups + 2,
  260. begin_pattern_groups + 3))
  261. matches = limit(matches, max_matches)
  262. for m in matches:
  263. yield InBetweenMatch.from_values(m.group(1),
  264. m.start(1),
  265. m.group(begin_pattern_groups + 2) +
  266. m.group(begin_pattern_groups + 3),
  267. m.start(begin_pattern_groups + 2),
  268. m.group(begin_pattern_groups + 4),
  269. m.start(begin_pattern_groups + 4))
  270. def escape(string, escape_chars, escape_with="\\"):
  271. """
  272. Escapes all chars given inside the given string.
  273. :param string: The string where to escape characters.
  274. :param escape_chars: The string or Iterable that contains the characters
  275. to escape. Each char inside this string will be
  276. escaped in the order given. Duplicate chars are
  277. allowed.
  278. :param escape_with: The string that should be used as escape sequence.
  279. :return: The escaped string.
  280. """
  281. for chr in escape_chars:
  282. string = string.replace(chr, escape_with + chr)
  283. return string
  284. def convert_to_raw(string, exceptions=""):
  285. """
  286. Converts a string to its raw form, converting all backslash to double
  287. backslash except when the backslash escapes a character given in
  288. exceptions.
  289. :param string: The given string that needs to be converted
  290. :param exceptions: A list of characters that if escaped with backslash
  291. should not be converted to double backslash.
  292. :return: Returns the corresponding raw string.
  293. """
  294. i = 0
  295. length = len(string)
  296. output = ""
  297. while i < length:
  298. if (string[i] == '\\' and
  299. i + 1 < length and string[i + 1] not in exceptions):
  300. output += "\\"
  301. # If the next character is a ``\`` then we need to write it now
  302. # itself since otherwise it will be interpreted as a newly started
  303. # escape sequence - thereby escaping the character at i + 2,
  304. # which is unintended behavior
  305. if string[i + 1] == '\\':
  306. i += 1
  307. output += string[i]
  308. i += 1
  309. return output
  310. def unescape(string):
  311. """
  312. Trimms off all escape characters from the given string.
  313. :param string: The string to unescape.
  314. """
  315. regex = r"\\(.)|\\$"
  316. return re.sub(regex, lambda m: m.group(1), string, 0, re.DOTALL)
  317. def position_is_escaped(string, position=None):
  318. """
  319. Checks whether a char at a specific position of the string is preceded by
  320. an odd number of backslashes.
  321. :param string: Arbitrary string
  322. :param position: Position of character in string that should be checked
  323. :return: True if the character is escaped, False otherwise
  324. """
  325. escapes_uneven = False
  326. # iterate backwards, starting one left of position.
  327. # Slicing provides a sane default behaviour and prevents IndexErrors
  328. for i in range(len(string[:position]) - 1, -1, -1):
  329. if string[i] == '\\':
  330. escapes_uneven = not escapes_uneven
  331. else:
  332. break
  333. return escapes_uneven
  334. def unescaped_rstrip(string):
  335. """
  336. Strips whitespaces from the right side of given string that are not
  337. escaped.
  338. :param string: The string where to strip whitespaces from.
  339. :return: The right-stripped string.
  340. """
  341. stripped = string.rstrip()
  342. if (len(string) > len(stripped) and
  343. position_is_escaped(stripped, len(string))):
  344. stripped += string[len(stripped)]
  345. return stripped
  346. def unescaped_strip(string):
  347. """
  348. Strips whitespaces of the given string taking escape characters into
  349. account.
  350. :param string: The string where to strip whitespaces from.
  351. :return: The stripped string.
  352. """
  353. return unescaped_rstrip(string).lstrip()
  354. def _nested_search_in_between(begin, end, string):
  355. """
  356. Searches for a string enclosed between a specified begin- and end-sequence.
  357. Matches infinite times.
  358. This is a function specifically designed to be invoked from
  359. ``nested_search_in_between()``.
  360. :param begin: A regex pattern that defines where to start matching.
  361. :param end: A regex pattern that defines where to end matching.
  362. :param string: The string where to search in.
  363. :return: An iterator returning the matched strings.
  364. """
  365. # Regex explanation:
  366. # 1. (begin) A capturing group that matches the begin sequence.
  367. # 2. (end) A capturing group that matches the end sequence. Because the
  368. # 1st group is lazy (matches as few times as possible) the next
  369. # occurring end-sequence is matched.
  370. # The '|' in the regex matches either the first or the second part.
  371. regex = "(" + begin + ")|(" + end + ")"
  372. left_match = None
  373. nesting_level = 0
  374. for match in re.finditer(regex, string, re.DOTALL):
  375. if match.group(1) is not None:
  376. if nesting_level == 0:
  377. # Store the match of the first nesting level to be able to
  378. # return the string until the next fitting end sequence.
  379. left_match = match
  380. nesting_level += 1
  381. else:
  382. # The second group matched. This is the only alternative if group 1
  383. # didn't, otherwise no match would be performed. No need to compile
  384. # the begin and end sequences to get the number of capturing groups
  385. # in them.
  386. if nesting_level > 0:
  387. nesting_level -= 1
  388. if nesting_level == 0 and left_match != None:
  389. yield InBetweenMatch.from_values(
  390. left_match.group(),
  391. left_match.start(),
  392. string[left_match.end(): match.start()],
  393. left_match.end(),
  394. match.group(),
  395. match.start())
  396. left_match = None
  397. def nested_search_in_between(begin,
  398. end,
  399. string,
  400. max_matches=0,
  401. remove_empty_matches=False,
  402. use_regex=False):
  403. """
  404. Searches for a string enclosed between a specified begin- and end-sequence.
  405. Also enclosed \\n are put into the result. Doesn't handle escape sequences,
  406. but supports nesting.
  407. Nested sequences are ignored during the match. Means you get only the first
  408. nesting level returned. If you want to acquire more levels, just reinvoke
  409. this function again on the return value.
  410. Using the same begin- and end-sequence won't match anything.
  411. :param begin: A pattern that defines where to start
  412. matching.
  413. :param end: A pattern that defines where to end matching.
  414. :param string: The string where to search in.
  415. :param max_matches: Defines the maximum number of matches. If 0 or
  416. less is provided, the number of splits is not
  417. limited.
  418. :param remove_empty_matches: Defines whether empty entries should
  419. be removed from the result. An entry is
  420. considered empty if no inner match was
  421. performed (regardless of matched start and
  422. end patterns).
  423. :param use_regex: Specifies whether to treat the begin and end
  424. patterns as regexes or simple strings.
  425. :return: An iterator returning the matched strings.
  426. """
  427. if not use_regex:
  428. begin = re.escape(begin)
  429. end = re.escape(end)
  430. strings = _nested_search_in_between(begin, end, string)
  431. if remove_empty_matches:
  432. strings = filter(lambda x: str(x.inside) != "", strings)
  433. return limit(strings, max_matches)