/third_party/rust/regex/v1/crate/src/re_set.rs
Rust | 475 lines | 176 code | 37 blank | 262 comment | 10 complexity | bff14d00cdf8447b61984281806437cc MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, Apache-2.0, BSD-3-Clause
- macro_rules! define_set {
- ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
- $(#[$doc_regexset_example:meta])* ) => {
- pub mod $name {
- use std::fmt;
- use std::iter;
- use std::slice;
- use std::vec;
- use crate::error::Error;
- use crate::exec::Exec;
- use crate::re_builder::$builder_mod::RegexSetBuilder;
- use crate::re_trait::RegularExpression;
/// Match multiple (possibly overlapping) regular expressions in a single scan.
///
/// A regex set corresponds to the union of two or more regular expressions.
/// That is, a regex set will match text where at least one of its
/// constituent regular expressions matches. A regex set as it's formulated
/// here provides a touch more power: it will also report *which* regular
/// expressions in the set match. Indeed, this is the key difference between
/// regex sets and a single `Regex` with many alternates, since only one
/// alternate can match at a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the text
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the text. The key advantage of using a regex set is that it
/// will report the matching regexes using a *single pass through the text*.
/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
/// router for a complex web application or a user agent matcher), then a regex
/// set can realize huge performance gains.
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
// The example itself is supplied by each `define_set!` invocation (as doc
// attributes) and spliced in here, so that the `&str` and `&[u8]` flavors
// each show code written against their own text type.
$(#[$doc_regexset_example])*
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```text
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
/// since the matching engines can stop after the first match is found.
///
/// Other features like finding the location of successive matches or their
/// sub-captures aren't supported. If you need this functionality, the
/// recommended approach is to compile each regex in the set independently and
/// selectively match them based on which regexes in the set matched.
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(mn)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the search text.
// Thin newtype: every method on `RegexSet` delegates to the wrapped `Exec`.
#[derive(Clone)]
pub struct RegexSet(Exec);
- impl RegexSet {
- /// Create a new regex set with the given regular expressions.
- ///
- /// This takes an iterator of `S`, where `S` is something that can produce
- /// a `&str`. If any of the strings in the iterator are not valid regular
- /// expressions, then an error is returned.
- ///
- /// # Example
- ///
- /// Create a new regex set from an iterator of strings:
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
- /// assert!(set.is_match("foo"));
- /// ```
- pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
- where S: AsRef<str>, I: IntoIterator<Item=S> {
- RegexSetBuilder::new(exprs).build()
- }
- /// Create a new empty regex set.
- ///
- /// # Example
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::empty();
- /// assert!(set.is_empty());
- /// ```
- pub fn empty() -> RegexSet {
- RegexSetBuilder::new(&[""; 0]).build().unwrap()
- }
- /// Returns true if and only if one of the regexes in this set matches
- /// the text given.
- ///
- /// This method should be preferred if you only need to test whether any
- /// of the regexes in the set should match, but don't care about *which*
- /// regexes matched. This is because the underlying matching engine will
- /// quit immediately after seeing the first match instead of continuing to
- /// find all matches.
- ///
- /// Note that as with searches using `Regex`, the expression is unanchored
- /// by default. That is, if the regex does not start with `^` or `\A`, or
- /// end with `$` or `\z`, then it is permitted to match anywhere in the
- /// text.
- ///
- /// # Example
- ///
- /// Tests whether a set matches some text:
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
- /// assert!(set.is_match("foo"));
- /// assert!(!set.is_match("☃"));
- /// ```
- pub fn is_match(&self, text: $text_ty) -> bool {
- self.is_match_at(text, 0)
- }
- /// Returns the same as is_match, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- #[doc(hidden)]
- pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
- self.0.searcher().is_match_at($as_bytes(text), start)
- }
- /// Returns the set of regular expressions that match in the given text.
- ///
- /// The set returned contains the index of each regular expression that
- /// matches in the given text. The index is in correspondence with the
- /// order of regular expressions given to `RegexSet`'s constructor.
- ///
- /// The set can also be used to iterate over the matched indices.
- ///
- /// Note that as with searches using `Regex`, the expression is unanchored
- /// by default. That is, if the regex does not start with `^` or `\A`, or
- /// end with `$` or `\z`, then it is permitted to match anywhere in the
- /// text.
- ///
- /// # Example
- ///
- /// Tests which regular expressions match the given text:
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::new(&[
- /// r"\w+",
- /// r"\d+",
- /// r"\pL+",
- /// r"foo",
- /// r"bar",
- /// r"barfoo",
- /// r"foobar",
- /// ]).unwrap();
- /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
- /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
- ///
- /// // You can also test whether a particular regex matched:
- /// let matches = set.matches("foobar");
- /// assert!(!matches.matched(5));
- /// assert!(matches.matched(6));
- /// ```
- pub fn matches(&self, text: $text_ty) -> SetMatches {
- let mut matches = vec![false; self.0.regex_strings().len()];
- let any = self.read_matches_at(&mut matches, text, 0);
- SetMatches {
- matched_any: any,
- matches: matches,
- }
- }
- /// Returns the same as matches, but starts the search at the given
- /// offset and stores the matches into the slice given.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
- ///
- /// `matches` must have a length that is at least the number of regexes
- /// in this set.
- ///
- /// This method returns true if and only if at least one member of
- /// `matches` is true after executing the set against `text`.
- #[doc(hidden)]
- pub fn read_matches_at(
- &self,
- matches: &mut [bool],
- text: $text_ty,
- start: usize,
- ) -> bool {
- self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
- }
- /// Returns the total number of regular expressions in this set.
- pub fn len(&self) -> usize {
- self.0.regex_strings().len()
- }
- /// Returns `true` if this set contains no regular expressions.
- pub fn is_empty(&self) -> bool {
- self.0.regex_strings().is_empty()
- }
- /// Returns the patterns that this set will match on.
- ///
- /// This function can be used to determine the pattern for a match. The
- /// slice returned has exactly as many patterns givens to this regex set,
- /// and the order of the slice is the same as the order of the patterns
- /// provided to the set.
- ///
- /// # Example
- ///
- /// ```rust
- /// # use regex::RegexSet;
- /// let set = RegexSet::new(&[
- /// r"\w+",
- /// r"\d+",
- /// r"\pL+",
- /// r"foo",
- /// r"bar",
- /// r"barfoo",
- /// r"foobar",
- /// ]).unwrap();
- /// let matches: Vec<_> = set
- /// .matches("foobar")
- /// .into_iter()
- /// .map(|match_idx| &set.patterns()[match_idx])
- /// .collect();
- /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
- /// ```
- pub fn patterns(&self) -> &[String] {
- self.0.regex_strings()
- }
- }
- /// A set of matches returned by a regex set.
- #[derive(Clone, Debug)]
- pub struct SetMatches {
- matched_any: bool,
- matches: Vec<bool>,
- }
- impl SetMatches {
- /// Whether this set contains any matches.
- pub fn matched_any(&self) -> bool {
- self.matched_any
- }
- /// Whether the regex at the given index matched.
- ///
- /// The index for a regex is determined by its insertion order upon the
- /// initial construction of a `RegexSet`, starting at `0`.
- ///
- /// # Panics
- ///
- /// If `regex_index` is greater than or equal to `self.len()`.
- pub fn matched(&self, regex_index: usize) -> bool {
- self.matches[regex_index]
- }
- /// The total number of regexes in the set that created these matches.
- pub fn len(&self) -> usize {
- self.matches.len()
- }
- /// Returns an iterator over indexes in the regex that matched.
- ///
- /// This will always produces matches in ascending order of index, where
- /// the index corresponds to the index of the regex that matched with
- /// respect to its position when initially building the set.
- pub fn iter(&self) -> SetMatchesIter<'_> {
- SetMatchesIter((&*self.matches).into_iter().enumerate())
- }
- }
- impl IntoIterator for SetMatches {
- type IntoIter = SetMatchesIntoIter;
- type Item = usize;
- fn into_iter(self) -> Self::IntoIter {
- SetMatchesIntoIter(self.matches.into_iter().enumerate())
- }
- }
- impl<'a> IntoIterator for &'a SetMatches {
- type IntoIter = SetMatchesIter<'a>;
- type Item = usize;
- fn into_iter(self) -> Self::IntoIter {
- self.iter()
- }
- }
/// An owned iterator over the set of matches from a regex set.
///
/// This always produces matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
///
/// Internally this walks one `bool` flag per regex and yields only the
/// positions whose flag is `true`.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances to the next regex whose flag is set and returns its index.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, hit)| hit).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags left.
    ///
    /// Flags that are `false` are skipped, so anywhere between zero and all
    /// of the remaining entries may be yielded. Forwarding the inner
    /// iterator's exact hint would overstate the lower bound and break the
    /// `Iterator::size_hint` contract.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Walks backwards to the previous regex whose flag is set and returns
    /// its index.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, hit)| hit).map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This always produces matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
///
/// Internally this walks one borrowed `bool` flag per regex and yields only
/// the positions whose flag is `true`.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances to the next regex whose flag is set and returns its index.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &hit)| hit).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags left.
    ///
    /// Flags that are `false` are skipped, so anywhere between zero and all
    /// of the remaining entries may be yielded. Forwarding the inner
    /// iterator's exact hint would overstate the lower bound and break the
    /// `Iterator::size_hint` contract.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Walks backwards to the previous regex whose flag is set and returns
    /// its index.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &hit)| hit).map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
- #[doc(hidden)]
- impl From<Exec> for RegexSet {
- fn from(exec: Exec) -> Self {
- RegexSet(exec)
- }
- }
- impl fmt::Debug for RegexSet {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- write!(f, "RegexSet({:?})", self.0.regex_strings())
- }
- }
/// Views a `&str` as its underlying UTF-8 bytes.
// Each `define_set!` expansion names only one of the two `as_bytes_*`
// helpers as `$as_bytes`, so the other one is unused in that module.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
/// Identity conversion: a byte slice is already in the form searches use.
// Each `define_set!` expansion names only one of the two `as_bytes_*`
// helpers as `$as_bytes`, so the other one is unused in that module.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    &text[..]
}
- }
- }
- }
// Instantiates `pub mod unicode`: a `RegexSet` that searches `&str` text,
// is built through `crate::re_builder::set_unicode::RegexSetBuilder`, and
// converts its input with `as_bytes_str`. The doc comment passed as the last
// argument is spliced into the `RegexSet` type documentation as its example.
define_set! {
    unicode,
    set_unicode,
    &str,
    as_bytes_str,
/// ```rust
/// # use regex::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match("foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches("example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
// Instantiates `pub mod bytes`: a `RegexSet` that searches `&[u8]` text,
// is built through `crate::re_builder::set_bytes::RegexSetBuilder`, and
// passes its input through `as_bytes_bytes` unchanged. The doc comment passed
// as the last argument is spliced into the `RegexSet` type documentation as
// its example.
define_set! {
    bytes,
    set_bytes,
    &[u8],
    as_bytes_bytes,
/// ```rust
/// # use regex::bytes::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}