PageRenderTime 2ms CodeModel.GetById 22ms app.highlight 10ms RepoModel.GetById 1ms app.codeStats 0ms

/third_party/rust/regex/v1/crate/src/re_set.rs

https://github.com/chromium/chromium
Rust | 475 lines | 176 code | 37 blank | 262 comment | 10 complexity | bff14d00cdf8447b61984281806437cc MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, Apache-2.0, BSD-3-Clause
  1macro_rules! define_set {
  2    ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
  3     $(#[$doc_regexset_example:meta])* ) => {
  4        pub mod $name {
  5            use std::fmt;
  6            use std::iter;
  7            use std::slice;
  8            use std::vec;
  9
 10            use crate::error::Error;
 11            use crate::exec::Exec;
 12            use crate::re_builder::$builder_mod::RegexSetBuilder;
 13            use crate::re_trait::RegularExpression;
 14
 15/// Match multiple (possibly overlapping) regular expressions in a single scan.
 16///
 17/// A regex set corresponds to the union of zero or more regular expressions.
 18/// That is, a regex set will match text where at least one of its
 19/// constituent regular expressions matches. A regex set as it's formulated here
 20/// provides a touch more power: it will also report *which* regular
 21/// expressions in the set match. Indeed, this is the key difference between
 22/// regex sets and a single `Regex` with many alternates, since only one
 23/// alternate can match at a time.
 24///
 25/// For example, consider regular expressions to match email addresses and
 26/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
 27/// regex set is constructed from those regexes, then searching the text
 28/// `foo@example.com` will report both regexes as matching. Of course, one
 29/// could accomplish this by compiling each regex on its own and doing two
 30/// searches over the text. The key advantage of using a regex set is that it
 31/// will report the matching regexes using a *single pass through the text*.
 32/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
 33/// router for a complex web application or a user agent matcher), then a regex
 34/// set can realize huge performance gains.
 35///
 36/// # Example
 37///
 38/// This shows how the above two regexes (for matching email addresses and
 39/// domains) might work:
 40///
 41$(#[$doc_regexset_example])*
 42///
 43/// Note that it would be possible to adapt the above example to using `Regex`
 44/// with an expression like:
 45///
 46/// ```text
 47/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
 48/// ```
 49///
 50/// After a match, one could then inspect the capture groups to figure out
 51/// which alternates matched. The problem is that it is hard to make this
 52/// approach scale when there are many regexes since the overlap between each
 53/// alternate isn't always obvious to reason about.
 54///
 55/// # Limitations
 56///
 57/// Regex sets are limited to answering the following two questions:
 58///
 59/// 1. Does any regex in the set match?
 60/// 2. If so, which regexes in the set match?
 61///
 62/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
 63/// since the matching engines can stop after the first match is found.
 64///
 65/// Other features like finding the location of successive matches or their
 66/// sub-captures aren't supported. If you need this functionality, the
 67/// recommended approach is to compile each regex in the set independently and
 68/// selectively match them based on which regexes in the set matched.
 69///
 70/// # Performance
 71///
 72/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
 73/// search takes `O(mn)` time, where `m` is proportional to the size of the
 74/// regex set and `n` is proportional to the length of the search text.
 75#[derive(Clone)]
 76pub struct RegexSet(Exec);
 77
 78impl RegexSet {
 79    /// Create a new regex set with the given regular expressions.
 80    ///
 81    /// This takes an iterator of `S`, where `S` is something that can produce
 82    /// a `&str`. If any of the strings in the iterator are not valid regular
 83    /// expressions, then an error is returned.
 84    ///
 85    /// # Example
 86    ///
 87    /// Create a new regex set from an iterator of strings:
 88    ///
 89    /// ```rust
 90    /// # use regex::RegexSet;
 91    /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
 92    /// assert!(set.is_match("foo"));
 93    /// ```
 94    pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
 95            where S: AsRef<str>, I: IntoIterator<Item=S> {
 96        RegexSetBuilder::new(exprs).build()
 97    }
 98
 99    /// Create a new empty regex set.
100    ///
101    /// # Example
102    ///
103    /// ```rust
104    /// # use regex::RegexSet;
105    /// let set = RegexSet::empty();
106    /// assert!(set.is_empty());
107    /// ```
108    pub fn empty() -> RegexSet {
109        RegexSetBuilder::new(&[""; 0]).build().unwrap()
110    }
111
112    /// Returns true if and only if one of the regexes in this set matches
113    /// the text given.
114    ///
115    /// This method should be preferred if you only need to test whether any
116    /// of the regexes in the set should match, but don't care about *which*
117    /// regexes matched. This is because the underlying matching engine will
118    /// quit immediately after seeing the first match instead of continuing to
119    /// find all matches.
120    ///
121    /// Note that as with searches using `Regex`, the expression is unanchored
122    /// by default. That is, if the regex does not start with `^` or `\A`, or
123    /// end with `$` or `\z`, then it is permitted to match anywhere in the
124    /// text.
125    ///
126    /// # Example
127    ///
128    /// Tests whether a set matches some text:
129    ///
130    /// ```rust
131    /// # use regex::RegexSet;
132    /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
133    /// assert!(set.is_match("foo"));
134    /// assert!(!set.is_match("☃"));
135    /// ```
136    pub fn is_match(&self, text: $text_ty) -> bool {
137        self.is_match_at(text, 0)
138    }
139
140    /// Returns the same as is_match, but starts the search at the given
141    /// offset.
142    ///
143    /// The significance of the starting point is that it takes the surrounding
144    /// context into consideration. For example, the `\A` anchor can only
145    /// match when `start == 0`.
146    #[doc(hidden)]
147    pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
148        self.0.searcher().is_match_at($as_bytes(text), start)
149    }
150
151    /// Returns the set of regular expressions that match in the given text.
152    ///
153    /// The set returned contains the index of each regular expression that
154    /// matches in the given text. The index is in correspondence with the
155    /// order of regular expressions given to `RegexSet`'s constructor.
156    ///
157    /// The set can also be used to iterate over the matched indices.
158    ///
159    /// Note that as with searches using `Regex`, the expression is unanchored
160    /// by default. That is, if the regex does not start with `^` or `\A`, or
161    /// end with `$` or `\z`, then it is permitted to match anywhere in the
162    /// text.
163    ///
164    /// # Example
165    ///
166    /// Tests which regular expressions match the given text:
167    ///
168    /// ```rust
169    /// # use regex::RegexSet;
170    /// let set = RegexSet::new(&[
171    ///     r"\w+",
172    ///     r"\d+",
173    ///     r"\pL+",
174    ///     r"foo",
175    ///     r"bar",
176    ///     r"barfoo",
177    ///     r"foobar",
178    /// ]).unwrap();
179    /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
180    /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
181    ///
182    /// // You can also test whether a particular regex matched:
183    /// let matches = set.matches("foobar");
184    /// assert!(!matches.matched(5));
185    /// assert!(matches.matched(6));
186    /// ```
187    pub fn matches(&self, text: $text_ty) -> SetMatches {
188        let mut matches = vec![false; self.0.regex_strings().len()];
189        let any = self.read_matches_at(&mut matches, text, 0);
190        SetMatches {
191            matched_any: any,
192            matches: matches,
193        }
194    }
195
196    /// Returns the same as matches, but starts the search at the given
197    /// offset and stores the matches into the slice given.
198    ///
199    /// The significance of the starting point is that it takes the surrounding
200    /// context into consideration. For example, the `\A` anchor can only
201    /// match when `start == 0`.
202    ///
203    /// `matches` must have a length that is at least the number of regexes
204    /// in this set.
205    ///
206    /// This method returns true if and only if at least one member of
207    /// `matches` is true after executing the set against `text`.
208    #[doc(hidden)]
209    pub fn read_matches_at(
210        &self,
211        matches: &mut [bool],
212        text: $text_ty,
213        start: usize,
214    ) -> bool {
215        self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
216    }
217
218    /// Returns the total number of regular expressions in this set.
219    pub fn len(&self) -> usize {
220        self.0.regex_strings().len()
221    }
222
223    /// Returns `true` if this set contains no regular expressions.
224    pub fn is_empty(&self) -> bool {
225        self.0.regex_strings().is_empty()
226    }
227
228    /// Returns the patterns that this set will match on.
229    ///
230    /// This function can be used to determine the pattern for a match. The
231    /// slice returned has exactly as many patterns givens to this regex set,
232    /// and the order of the slice is the same as the order of the patterns
233    /// provided to the set.
234    ///
235    /// # Example
236    ///
237    /// ```rust
238    /// # use regex::RegexSet;
239    /// let set = RegexSet::new(&[
240    ///     r"\w+",
241    ///     r"\d+",
242    ///     r"\pL+",
243    ///     r"foo",
244    ///     r"bar",
245    ///     r"barfoo",
246    ///     r"foobar",
247    /// ]).unwrap();
248    /// let matches: Vec<_> = set
249    ///     .matches("foobar")
250    ///     .into_iter()
251    ///     .map(|match_idx| &set.patterns()[match_idx])
252    ///     .collect();
253    /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
254    /// ```
255    pub fn patterns(&self) -> &[String] {
256        self.0.regex_strings()
257    }
258}
259
260/// A set of matches returned by a regex set.
261#[derive(Clone, Debug)]
262pub struct SetMatches {
263    matched_any: bool,
264    matches: Vec<bool>,
265}
266
267impl SetMatches {
268    /// Whether this set contains any matches.
269    pub fn matched_any(&self) -> bool {
270        self.matched_any
271    }
272
273    /// Whether the regex at the given index matched.
274    ///
275    /// The index for a regex is determined by its insertion order upon the
276    /// initial construction of a `RegexSet`, starting at `0`.
277    ///
278    /// # Panics
279    ///
280    /// If `regex_index` is greater than or equal to `self.len()`.
281    pub fn matched(&self, regex_index: usize) -> bool {
282        self.matches[regex_index]
283    }
284
285    /// The total number of regexes in the set that created these matches.
286    pub fn len(&self) -> usize {
287        self.matches.len()
288    }
289
290    /// Returns an iterator over indexes in the regex that matched.
291    ///
292    /// This will always produces matches in ascending order of index, where
293    /// the index corresponds to the index of the regex that matched with
294    /// respect to its position when initially building the set.
295    pub fn iter(&self) -> SetMatchesIter<'_> {
296        SetMatchesIter((&*self.matches).into_iter().enumerate())
297    }
298}
299
300impl IntoIterator for SetMatches {
301    type IntoIter = SetMatchesIntoIter;
302    type Item = usize;
303
304    fn into_iter(self) -> Self::IntoIter {
305        SetMatchesIntoIter(self.matches.into_iter().enumerate())
306    }
307}
308
309impl<'a> IntoIterator for &'a SetMatches {
310    type IntoIter = SetMatchesIter<'a>;
311    type Item = usize;
312
313    fn into_iter(self) -> Self::IntoIter {
314        self.iter()
315    }
316}
317
/// An owned iterator over the set of matches from a regex set.
///
/// Yields the index of every regex whose flag is set, in ascending order,
/// where the index corresponds to the position of the regex when the set was
/// initially built.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Skips over regexes that did not match and returns the index of the
    /// next one that did, or `None` once the flags are exhausted.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags left.
    ///
    /// Unmatched entries are skipped, so anywhere from zero to all of the
    /// remaining flags may be yielded. The inner iterator's lower bound
    /// counts every remaining flag and would over-promise, so only its upper
    /// bound is forwarded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Same as `next`, but scans from the highest remaining index downwards.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}
357
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// Yields the index of every regex whose flag is set, in ascending order,
/// where the index corresponds to the position of the regex when the set was
/// initially built.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Skips over regexes that did not match and returns the index of the
    /// next one that did, or `None` once the flags are exhausted.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags left.
    ///
    /// Unmatched entries are skipped, so anywhere from zero to all of the
    /// remaining flags may be yielded. The inner iterator's lower bound
    /// counts every remaining flag and would over-promise, so only its upper
    /// bound is forwarded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Same as `next`, but scans from the highest remaining index downwards.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
399
400#[doc(hidden)]
401impl From<Exec> for RegexSet {
402    fn from(exec: Exec) -> Self {
403        RegexSet(exec)
404    }
405}
406
407impl fmt::Debug for RegexSet {
408    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
409        write!(f, "RegexSet({:?})", self.0.regex_strings())
410    }
411}
412
/// Views a `&str` haystack as its underlying bytes.
// Each instantiation of `define_set!` passes only one of the two
// `as_bytes_*` helpers, so the other one is unused in that module.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
/// Identity conversion for haystacks that are already byte slices.
// Each instantiation of `define_set!` passes only one of the two
// `as_bytes_*` helpers, so the other one is unused in that module.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    &text[..]
}
415        }
416    }
417}
418
// Generates the `unicode` module: its `RegexSet` searches `&str` text, takes
// its builder from `re_builder::set_unicode`, and converts text to bytes with
// `as_bytes_str`. The doc lines below are forwarded to the macro and spliced
// in as the example in the `RegexSet` type documentation.
419define_set! {
420    unicode,
421    set_unicode,
422    &str,
423    as_bytes_str,
424/// ```rust
425/// # use regex::RegexSet;
426/// let set = RegexSet::new(&[
427///     r"[a-z]+@[a-z]+\.(com|org|net)",
428///     r"[a-z]+\.(com|org|net)",
429/// ]).unwrap();
430///
431/// // Ask whether any regexes in the set match.
432/// assert!(set.is_match("foo@example.com"));
433///
434/// // Identify which regexes in the set match.
435/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
436/// assert_eq!(vec![0, 1], matches);
437///
438/// // Try again, but with text that only matches one of the regexes.
439/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
440/// assert_eq!(vec![1], matches);
441///
442/// // Try again, but with text that doesn't match any regex in the set.
443/// let matches: Vec<_> = set.matches("example").into_iter().collect();
444/// assert!(matches.is_empty());
445/// ```
446}
447
// Generates the `bytes` module: its `RegexSet` searches `&[u8]` text, takes
// its builder from `re_builder::set_bytes`, and passes text through
// `as_bytes_bytes`. The doc lines below are forwarded to the macro and
// spliced in as the example in the `RegexSet` type documentation.
448define_set! {
449    bytes,
450    set_bytes,
451    &[u8],
452    as_bytes_bytes,
453/// ```rust
454/// # use regex::bytes::RegexSet;
455/// let set = RegexSet::new(&[
456///     r"[a-z]+@[a-z]+\.(com|org|net)",
457///     r"[a-z]+\.(com|org|net)",
458/// ]).unwrap();
459///
460/// // Ask whether any regexes in the set match.
461/// assert!(set.is_match(b"foo@example.com"));
462///
463/// // Identify which regexes in the set match.
464/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
465/// assert_eq!(vec![0, 1], matches);
466///
467/// // Try again, but with text that only matches one of the regexes.
468/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
469/// assert_eq!(vec![1], matches);
470///
471/// // Try again, but with text that doesn't match any regex in the set.
472/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
473/// assert!(matches.is_empty());
474/// ```
475}